aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Georgios Pinitas <georgios.pinitas@arm.com> 2020-11-02 01:37:17 +0000
committer: Georgios Pinitas <georgios.pinitas@arm.com> 2020-11-12 15:59:25 +0000
commit: c0b6f76561580414f08633a804fc548ccad65659 (patch)
tree: 4d46b7f479de04f799e29095392948aeb370c029
parent: 824061d9910ebb42cbe46b677c0b843db212c9a2 (diff)
download: ComputeLibrary-c0b6f76561580414f08633a804fc548ccad65659.tar.gz
COMPMID-3776: Indirect GEMM
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I51a1b0f098bc3a8c408c50c92221e4df3061e12c
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4343
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Sang-Hoon Park <sang-hoon.park@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--Android.bp129
-rw-r--r--arm_compute/core/Types.h9
-rw-r--r--arm_compute/runtime/FunctionDescriptors.h24
-rw-r--r--arm_compute/runtime/NEON/NEFunctions.h2
-rw-r--r--arm_compute/runtime/NEON/functions/NEConvolutionLayer.h9
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h58
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMConv2d.h108
-rw-r--r--arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h74
-rw-r--r--docs/06_functions_list.dox3
-rw-r--r--src/core/NEON/NEKernels.h1
-rw-r--r--src/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h89
-rw-r--r--src/core/NEON/kernels/arm_gemm/convolver.hpp182
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp81
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp46
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp120
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp52
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp621
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp15
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized_inline.hpp265
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp28
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_int16.cpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_int8.cpp107
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp896
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp3
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp143
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp126
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp106
-rw-r--r--src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp86
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp151
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp193
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp225
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp225
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp213
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp270
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp212
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp196
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp282
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp306
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp286
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp322
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp306
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp286
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp322
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp247
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp181
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp223
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp343
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp370
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp370
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp319
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp362
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp362
-rw-r--r--src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp48
-rw-r--r--src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp409
-rw-r--r--src/core/NEON/kernels/arm_gemm/interleave_indirect.hpp43
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp)11
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp)17
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp)17
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp5
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp)17
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp)23
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp1546
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp)18
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp)4
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp)4
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp86
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp3668
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp85
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp5400
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp2427
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp1802
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp1810
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp1934
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp)61
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp3430
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp85
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp2195
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp)49
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp2072
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp85
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp3613
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp2434
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp1808
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp85
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp3335
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp)49
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp2072
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp2434
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp1808
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp85
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp3335
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp328
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8.hpp)13
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp)8
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp)28
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp)9
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp)9
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp)26
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp)10
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp)4
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp)10
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8/generic.cpp)4
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp)12
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/a55.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/a55.cpp)1854
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/generic.cpp)1558
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp)12
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/a55.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/a55.cpp)1360
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/generic.cpp)1076
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp)12
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/a55.cpp)1854
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/generic.cpp)1558
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp)12
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/a55.cpp)1360
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/generic.cpp)1076
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp)29
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp1372
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp2247
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp)49
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp2237
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp89
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp3459
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp89
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp1633
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp89
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp2001
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp3778
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp)48
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp3178
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp2118
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp85
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp2236
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp)50
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp1751
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp3459
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp)46
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp1602
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp85
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp2770
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp2137
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp85
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp1904
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp)46
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp1602
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp2137
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp85
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp1904
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp)8
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp)8
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp)8
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp)8
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp)8
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp)9
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp)9
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp)9
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp)9
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp)2
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp)8
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp)4
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp)8
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp)4250
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp)8
-rw-r--r--src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp (renamed from src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp)4250
-rw-r--r--src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/quantized.cpp173
-rw-r--r--src/core/NEON/kernels/arm_gemm/quantized.hpp6
-rw-r--r--src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp1160
-rw-r--r--src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp1160
-rw-r--r--src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp20
-rw-r--r--src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp21
-rw-r--r--src/core/NEON/kernels/arm_gemm/transform.hpp8
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp167
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp10
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp128
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp182
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp191
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp228
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp207
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_s8_to_s16.hpp224
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_u8_to_u16.hpp224
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp8
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp8
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp8
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/list.hpp14
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp596
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_16bit.hpp596
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp596
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_16bit.hpp596
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp596
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block8_8bit.hpp596
-rw-r--r--src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp2
-rw-r--r--src/core/NEON/kernels/arm_gemm/utils.hpp93
-rw-r--r--src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp89
-rw-r--r--src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h108
-rw-r--r--src/core/NEON/kernels/assembly/arm_gemm.hpp18
-rw-r--r--src/core/NEON/kernels/assembly/convolution_parameters.hpp65
-rw-r--r--src/core/NEON/kernels/assembly/gemm_common.hpp26
-rw-r--r--src/runtime/NEON/functions/NEConvolutionLayer.cpp61
-rw-r--r--src/runtime/NEON/functions/NEGEMM.cpp26
-rw-r--r--src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp328
-rw-r--r--src/runtime/NEON/functions/NEGEMMConv2d.cpp167
-rw-r--r--src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp142
-rw-r--r--src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp28
-rw-r--r--src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp46
-rw-r--r--src/runtime/NEON/functions/NESimpleAssemblyFunction.h56
-rw-r--r--tests/validation/NEON/ConvolutionLayer.cpp97
-rw-r--r--tests/validation/NEON/GEMMLowp.cpp23
-rw-r--r--tests/validation/fixtures/ConvolutionLayerFixture.h16
235 files changed, 79188 insertions, 53560 deletions
diff --git a/Android.bp b/Android.bp
index 8d931c23c8..98b00cf5ba 100644
--- a/Android.bp
+++ b/Android.bp
@@ -367,10 +367,12 @@ cc_library_static {
"src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp",
"src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp",
"src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp",
+ "src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp",
"src/core/NEON/kernels/arm_gemm/mergeresults.cpp",
"src/core/NEON/kernels/arm_gemm/misc.cpp",
"src/core/NEON/kernels/arm_gemm/quantized.cpp",
- "src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp",
+ "src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp",
+ "src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp",
"src/core/NEON/kernels/convolution/common/padding.cpp",
"src/core/NEON/kernels/convolution/common/qasymm8.cpp",
"src/core/NEON/kernels/convolution/common/qsymm8.cpp",
@@ -669,9 +671,9 @@ cc_library_static {
"src/runtime/NEON/functions/NEFuseBatchNormalization.cpp",
"src/runtime/NEON/functions/NEGEMM.cpp",
"src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp",
+ "src/runtime/NEON/functions/NEGEMMConv2d.cpp",
"src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp",
"src/runtime/NEON/functions/NEGEMMInterleave4x4.cpp",
- "src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp",
"src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp",
"src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp",
"src/runtime/NEON/functions/NEGEMMTranspose1xW.cpp",
@@ -727,7 +729,6 @@ cc_library_static {
"src/runtime/NEON/functions/NEScale.cpp",
"src/runtime/NEON/functions/NEScharr3x3.cpp",
"src/runtime/NEON/functions/NESelect.cpp",
- "src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp",
"src/runtime/NEON/functions/NESlice.cpp",
"src/runtime/NEON/functions/NESobel3x3.cpp",
"src/runtime/NEON/functions/NESobel5x5.cpp",
@@ -779,69 +780,71 @@ cc_library_static {
},
arm64: {
srcs: [
- "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12/generic.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp",
"src/core/NEON/kernels/arm_gemm/kernels/a64_sgemv_pretransposed/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/a55.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp",
- "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp",
+ "src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp",
],
},
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 306bdc6706..2e639c4be4 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -137,10 +137,11 @@ enum class DataLayoutDimension
/** Available ConvolutionMethod*/
enum class ConvolutionMethod
{
- GEMM, /**< Convolution using GEMM */
- DIRECT, /**< Direct convolution */
- WINOGRAD, /**< Convolution using Winograd */
- FFT /**< Convolution using FFT */
+ GEMM, /**< Convolution using GEMM */
+ GEMM_CONV2D, /**< Direct 2D GEMM convolution */
+ DIRECT, /**< Direct convolution */
+ WINOGRAD, /**< Convolution using Winograd */
+ FFT /**< Convolution using FFT */
};
/** Available DepthwiseConvolutionFunction*/
diff --git a/arm_compute/runtime/FunctionDescriptors.h b/arm_compute/runtime/FunctionDescriptors.h
index 16d6c345e2..1f4216eb21 100644
--- a/arm_compute/runtime/FunctionDescriptors.h
+++ b/arm_compute/runtime/FunctionDescriptors.h
@@ -23,6 +23,9 @@
*/
#ifndef ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H
#define ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H
+
+#include "arm_compute/core/Types.h"
+
#include <utility>
namespace arm_compute
@@ -48,5 +51,26 @@ struct FFT2DInfo
unsigned int axis1{ 1 }; /**< Axis to run second pass on. If same, multiple transforms are performed on single axis*/
FFTDirection direction{ FFTDirection::Forward }; /**< Direction of the FFT. */
};
+
+/** Descriptor used by the Convolution function */
+struct Conv2dInfo
+{
+ Conv2dInfo() = default;
+
+ Conv2dInfo(const PadStrideInfo &conv_info,
+ const Size2D &dilation,
+ const ActivationLayerInfo &act_info,
+ bool enable_fast_math,
+ unsigned int num_groups)
+ : conv_info(conv_info), dilation(dilation), act_info(act_info), enable_fast_math(enable_fast_math), num_groups(num_groups)
+ {
+ }
+
+ PadStrideInfo conv_info{};
+ Size2D dilation{ 1U, 1U };
+ ActivationLayerInfo act_info{};
+ bool enable_fast_math{ false };
+ unsigned int num_groups{ 1 };
+};
} // namespace arm_compute
#endif /* ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H */
diff --git a/arm_compute/runtime/NEON/NEFunctions.h b/arm_compute/runtime/NEON/NEFunctions.h
index a97fa3b81a..e7d59e1608 100644
--- a/arm_compute/runtime/NEON/NEFunctions.h
+++ b/arm_compute/runtime/NEON/NEFunctions.h
@@ -78,9 +78,9 @@
#include "arm_compute/runtime/NEON/functions/NEFuseBatchNormalization.h"
#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMInterleave4x4.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMTranspose1xW.h"
diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
index 54dae57752..a061dc7b04 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
@@ -26,16 +26,15 @@
#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
+
#include <memory>
namespace arm_compute
{
+// Forward declarations
class ITensor;
/** Basic function to simulate a convolution layer. This function calls one of the following NEON functions:
@@ -158,5 +157,5 @@ private:
std::shared_ptr<IMemoryManager> _memory_manager;
std::unique_ptr<IFunction> _function; /**< Function to run */
};
-}
+} // namespace arm_compute
#endif /* ARM_COMPUTE_NECONVOLUTIONLAYER_H */
\ No newline at end of file
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h b/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h
index ac77acf69d..8f9498d0f5 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h
@@ -32,6 +32,28 @@
namespace arm_compute
{
+/* Convolution method supported by the assembly gemm interface */
+enum class AsmConvMethod
+{
+ Im2Col,
+ Indirect,
+ Conv
+};
+
+struct AsmGemmInfo
+{
+ AsmConvMethod method{ AsmConvMethod::Im2Col };
+ PadStrideInfo ps_info{};
+ ActivationLayerInfo activation_info{};
+ GEMMLowpOutputStageInfo output_stage{};
+ bool negated_offsets{ true };
+ bool reinterpret_input_as_3d{ false };
+ bool depth_output_gemm3d{ false };
+ int64_t padding_top{ 0 };
+ int64_t padding_left{ 0 };
+ float padding_value{ 0.f };
+};
+
/** Assembly kernel glue */
class NEGEMMAssemblyDispatch : public IFunction
{
@@ -55,33 +77,28 @@ public:
virtual ~IFallback() = default;
};
-private:
- /** Interface for the arm_gemm fallback */
- std::unique_ptr<IFallback> _arm_gemm;
- MemoryGroup _memory_group; /**< Function memory group */
- IWeightsManager *_weights_manager; /**< Pointer to the weights manager */
public:
/** If supported create a Compute Library function else fallback to the arm_gemm function.
*
- * @param[in] a Input tensor (Matrix A)
- * @param[in] b Input tensor (Matrix B)
- * @param[in] c Input tensor (Matrix C) used to pass the bias for quantized calculations
- * @param[out] d Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
- * @param[in] gemm_info GEMM meta-data
+ * @param[in] a Input tensor (Matrix A)
+ * @param[in] b Input tensor (Matrix B)
+ * @param[in] c Input tensor (Matrix C) used to pass the bias for quantized calculations
+ * @param[out] d Output tensor to store the result of matrix multiplication. Data type supported: same as @p a.
+ * @param[in] info GEMM meta-data
*/
- void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const GEMMInfo &gemm_info);
+ void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const AsmGemmInfo &info);
/** Indicates whether or not this function can be used to process the given parameters.
*
- * @param[in] a Input tensor info (Matrix A)
- * @param[in] b Input tensor info (Matrix B)
- * @param[in] c Input tensor info (Matrix C) used to pass the bias for quantized calculations
- * @param[in] d Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
- * @param[in] gemm_info GEMM meta-data
+ * @param[in] a Input tensor info (Matrix A)
+ * @param[in] b Input tensor info (Matrix B)
+ * @param[in] c Input tensor info (Matrix C) used to pass the bias for quantized calculations
+ * @param[in] d Output tensor info to store the result of matrix multiplication. Data type supported: same as @p a.
+ * @param[in] info GEMM meta-data
*
* @return a status.
*/
- static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const GEMMInfo &gemm_info);
+ static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info);
/** Checks if activation is supported by the gemm assembly dispatcher
*
* @param[in] activation Activation to check
@@ -94,10 +111,15 @@ public:
* @return True if the function is configured and ready to run
*/
bool is_configured() const;
+
// Inherited methods overridden:
- /** Runs a preparation step, usually for pre-transposing matrix b */
void prepare() override;
void run() override;
+
+private:
+ std::unique_ptr<IFallback> _arm_gemm; /**< Interface for the arm_gemm fallback */
+ MemoryGroup _memory_group; /**< Function memory group */
+ IWeightsManager *_weights_manager; /**< Pointer to the weights manager */
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEGEMMASSEMBLYDISPATCH_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
new file mode 100644
index 0000000000..7cae39397f
--- /dev/null
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_NEGEMMCONV2D_H
+#define ARM_COMPUTE_NEGEMMCONV2D_H
+
+#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
+#include "arm_compute/runtime/NEON/functions/NEPermute.h"
+#include "arm_compute/runtime/Tensor.h"
+
+#include <memory>
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+/** Basic function to compute the convolution layer. This function calls the following NEON kernels/functions:
+ *
+ * Supports only NHWC data layout
+ *
+ * -# @ref NEGEMMAssemblyDispatch
+ * -# @ref NEActivationLayer, in case activation cannot be fused in the assembly dispatch
+ *
+ * Weights are transformed from OHWI to HWIO format using the following kernels:
+ * -# @ref NEPermute
+ */
+class NEGEMMConv2d : public IFunction
+{
+public:
+ /** Constructor */
+ NEGEMMConv2d(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr);
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMConv2d(const NEGEMMConv2d &) = delete;
+ /** Default move constructor */
+ NEGEMMConv2d(NEGEMMConv2d &&) = default;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ NEGEMMConv2d &operator=(const NEGEMMConv2d &) = delete;
+ /** Default move assignment operator */
+ NEGEMMConv2d &operator=(NEGEMMConv2d &&) = default;
+ /** Set the input and output tensors.
+ *
+ * @param[in] input Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represents a batch of inputs.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[out] output Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] info Convolution layer descriptor
+ */
+ void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info);
+ /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConv2d
+ *
+ * @param[in] input Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represents a batch of inputs.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[in] output Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] info Convolution layer descriptor (see @ref Conv2dInfo).
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info);
+
+ // Inherited methods overridden:
+ void run() override;
+ void prepare() override;
+
+private:
+ NEGEMMAssemblyDispatch _gemm_asm_func;
+ NEActivationLayer _activation_func;
+ NEPermute _weights_permute_func;
+ const ITensor *_original_weights;
+ Tensor _permuted_weights;
+ bool _is_prepared;
+ bool _run_activation;
+};
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_NEGEMMCONV2D_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h
deleted file mode 100644
index 961b1901e7..0000000000
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h
+++ /dev/null
@@ -1,74 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMLOWPASSEMBLYMATRIXMULTIPLYCORE_H
-#define ARM_COMPUTE_NEGEMMLOWPASSEMBLYMATRIXMULTIPLYCORE_H
-
-#include "arm_compute/runtime/IFunction.h"
-#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
-#include "arm_compute/runtime/Tensor.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-// Forward declarations
-class ITensor;
-class NEGEMMInterleave4x4Kernel;
-class NEGEMMTranspose1xWKernel;
-class NEGEMMLowpMatrixMultiplyKernel;
-
-/** Basic function to execute matrix multiply assembly kernels. */
-class NEGEMMLowpAssemblyMatrixMultiplyCore : public IFunction
-{
-public:
- /** Constructor */
- NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
- /** Destructor */
- ~NEGEMMLowpAssemblyMatrixMultiplyCore();
-
- /** Initialise the kernel's inputs, output
- *
- * @param[in] a First input tensor (Matrix A). Data type supported: U8, S8.
- * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a
- * @param[in] c Third input tensor (Matrix C). Data type supported: same as @p a
- * @param[out] output Output tensor. Data type supported: Data type supported: U32, S32
- */
- void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- MemoryGroup _memory_group;
- NEGEMMAssemblyDispatch _asm_glue;
- std::unique_ptr<NEGEMMLowpMatrixMultiplyKernel> _mm_kernel;
- std::unique_ptr<NEGEMMInterleave4x4Kernel> _mtx_a_reshape_kernel;
- std::unique_ptr<NEGEMMTranspose1xWKernel> _mtx_b_reshape_kernel;
- Tensor _tmp_a;
- Tensor _tmp_b;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMLOWPASSEMBLYMATRIXMULTIPLYCORE_H */
diff --git a/docs/06_functions_list.dox b/docs/06_functions_list.dox
index ac944610dc..e6924211e2 100644
--- a/docs/06_functions_list.dox
+++ b/docs/06_functions_list.dox
@@ -141,8 +141,8 @@ namespace arm_compute
- @ref NEGaussianPyramidOrb
- @ref NEGEMM
- @ref NEGEMMAssemblyDispatch
+ - @ref NEGEMMConv2d
- @ref NEGEMMConvolutionLayer
- - @ref NEGEMMLowpAssemblyMatrixMultiplyCore
- @ref NEGEMMLowpMatrixMultiplyCore
- @ref NEGenerateProposalsLayer
- @ref NEHarrisCorners
@@ -173,7 +173,6 @@ namespace arm_compute
- @ref NERNNLayer
- @ref NEROIPoolingLayer
- @ref NEScale
- - @ref NESimpleAssemblyFunction
- @ref NESobel5x5
- @ref NESobel7x7
- @ref NESoftmaxLayerGeneric &lt;IS_LOG&gt;
diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h
index 67562933d4..79c4bcea25 100644
--- a/src/core/NEON/NEKernels.h
+++ b/src/core/NEON/NEKernels.h
@@ -72,7 +72,6 @@
#include "src/core/NEON/kernels/NEFlattenLayerKernel.h"
#include "src/core/NEON/kernels/NEFloorKernel.h"
#include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
-#include "src/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
diff --git a/src/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h b/src/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
deleted file mode 100644
index 775a2c06ab..0000000000
--- a/src/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEGEMMASSEMBLYBASE_H
-#define ARM_COMPUTE_NEGEMMASSEMBLYBASE_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Base class for GEMM NEON kernels implemented in Assembly. */
-class NEGEMMAssemblyBaseKernel : public INEKernel
-{
-public:
- const char *name() const override
- {
- return "NEGEMMAssemblyBaseKernel";
- }
- /** Constructor */
- NEGEMMAssemblyBaseKernel()
- : _input0(nullptr), _input1(nullptr), _output(nullptr), _workspace(nullptr), _alpha(1.f), _beta(0.f), _is_transposed_0(false), _is_transposed_1(false)
- {
- }
-
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMAssemblyBaseKernel(const NEGEMMAssemblyBaseKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEGEMMAssemblyBaseKernel &operator=(const NEGEMMAssemblyBaseKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEGEMMAssemblyBaseKernel(NEGEMMAssemblyBaseKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEGEMMAssemblyBaseKernel &operator=(NEGEMMAssemblyBaseKernel &&) = default;
-
- virtual ~NEGEMMAssemblyBaseKernel() = default;
-
- /** Initialise the kernel's input and output.
- *
- * The computed function is C = a * AxB + b * C.
- *
- * @param[in] input0 Input tensor containing the Matrix A. Data types supported: F32
- * @param[in] input1 Input tensor containing the Matrix B. Data types supported: same as @p input0
- * @param[in,out] output Output tensor to store the result of matrix multiplication. If @p beta is not zero the values are multiplied by @p beta before the result is accumulated. Otherwise the values are overwritten by the result. Data types supported: same as @p input0.
- * @param[out] workspace Space for intermediate results.
- * @param[in] alpha Weight of the matrix product
- * @param[in] beta Weight of the accumulation.
- * @param[in] is_transposed_0 (Optional)True if @p input0 is transposed else false. (Defaults to false)
- * @param[in] is_transposed_1 (Optional)True if @p input1 is transposed else false. (Defaults to false)
- */
- void configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha = 1.f, float beta = 0.f, bool is_transposed_0 = false, bool is_transposed_1 = false)
- {
- internal_configure(input0, input1, output, workspace, alpha, beta, is_transposed_0, is_transposed_1);
- }
-
-protected:
- virtual void internal_configure(const ITensor *input0, const ITensor *input1, ITensor *output, ITensor *workspace, float alpha, float beta, bool _is_transposed_0, bool _is_transposed_1) = 0;
-
- const ITensor *_input0;
- const ITensor *_input1;
- ITensor *_output;
- ITensor *_workspace;
- float _alpha;
- float _beta;
- bool _is_transposed_0;
- bool _is_transposed_1;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEGEMMASSEMBLYBASE_H*/
diff --git a/src/core/NEON/kernels/arm_gemm/convolver.hpp b/src/core/NEON/kernels/arm_gemm/convolver.hpp
new file mode 100644
index 0000000000..1cd959523f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/convolver.hpp
@@ -0,0 +1,182 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include "convolution_parameters.hpp"
+
+#include <algorithm>
+#include <cstddef>
+#include <tuple>
+#include <vector>
+
+namespace arm_gemm {
+
+// Class to assist with convolution calculations.
+//
+// This is framed as a hierarchy of objects:
+//
+// - Top level object which depends only on convolution parameters. This sets up std::vectors for the padding and
+// kernel offset arrays. From this you can request:
+//
+// - Mid level object (e.g. instantiated at start of 'ConvolutionInterleave'). This holds specifics about the
+// input tensor, and the desired column range. Calculations specific to this can be done once when this is set
+// up. From this you can request:
+//
+// - Low level object (instantiated for each range of rows). This contains methods to actually populate a row
+// pointer array.
+
+
+template<typename T>
+class convolver {
+private:
+ const ConvolutionParameters m_params;
+
+ // Vector of padding data
+ const std::vector<T> m_pad_row;
+
+ // X/Y offsets for each kernel position
+ std::vector<int> m_kernel_y;
+ std::vector<int> m_kernel_x;
+
+ class column_handler {
+ private:
+ const convolver<T> &m_parent;
+
+ // Base/stride of input image
+ const T * const m_input_base;
+ const size_t m_input_stride;
+
+ // Starting kernel point and channel offset within that point
+ const unsigned int m_start_pos;
+ const unsigned int m_start_offset;
+
+ // Total length to process, rounded length of each input channel block.
+ const unsigned int m_length;
+ const unsigned int m_rounded_stringlen;
+
+ class row_handler {
+ private:
+ const convolver<T> &m_convolver;
+ const column_handler &m_parent;
+
+ // These variables track progress through the current block of rows
+ unsigned int m_start_output_y=0;
+ unsigned int m_start_output_x=0;
+
+ unsigned int m_length_remaining=0;
+ unsigned int m_current_pos=0;
+
+ unsigned int m_active_height=0;
+
+ public:
+ row_handler(const column_handler &parent, unsigned int start_row, unsigned int active_height) :
+ m_convolver(parent.m_parent),
+ m_parent(parent),
+ m_start_output_y(start_row / m_convolver.m_params.output_width),
+ m_start_output_x(start_row % m_convolver.m_params.output_width),
+ m_length_remaining(m_parent.m_length),
+ m_current_pos(m_parent.m_start_pos),
+ m_active_height(active_height) { }
+
+ bool finished() const {
+ return (m_length_remaining == 0);
+ }
+
+ std::tuple<unsigned int, unsigned int> next_block(const T ** const row_ptr) {
+ if (finished()) {
+ return { 0, 0 };
+ }
+
+ // "in_width" is the amount of data that will be read in (copied)
+ // "out_width" is the total amount of data that will be produced (including padding)
+ unsigned int offset = (m_current_pos == m_parent.m_start_pos) ? m_parent.m_start_offset : 0;
+ unsigned int in_width = std::min(m_length_remaining, static_cast<unsigned int>(m_convolver.m_params.input_channels) - offset);
+ unsigned int out_width = std::min(m_length_remaining, m_parent.m_rounded_stringlen - offset);
+
+ unsigned int output_y = m_start_output_y;
+ unsigned int output_x = m_start_output_x;
+
+ for (unsigned int row=0; row<m_active_height; row++) {
+ int input_y = (output_y * m_convolver.m_params.output_stride_h) + m_convolver.m_kernel_y[m_current_pos];
+ int input_x = (output_x * m_convolver.m_params.output_stride_w) + m_convolver.m_kernel_x[m_current_pos];
+
+ // Out-of-bounds points will read the padding data,
+ // otherwise find the correct address in the input image.
+ if (input_y < 0 || input_y >= m_convolver.m_params.input_height || input_x < 0 || input_x >= m_convolver.m_params.input_width) {
+ row_ptr[row] = m_convolver.m_pad_row.data();
+ } else {
+ row_ptr[row] = m_parent.m_input_base + ((input_y * m_convolver.m_params.input_width) + input_x) * m_parent.m_input_stride;
+ }
+
+ output_x++;
+ if (output_x == m_convolver.m_params.output_width) {
+ output_y++;
+ output_x=0;
+ }
+ }
+
+ m_current_pos++;
+ m_length_remaining-=out_width;
+
+ return { in_width, offset };
+ }
+ }; // end of "row handler" class
+
+ public:
+ column_handler(const convolver<T> &parent, const T *input_base, size_t input_stride,
+ unsigned int k_start, unsigned int k_end, unsigned int rounded_stringlen)
+ : m_parent(parent), m_input_base(input_base), m_input_stride(input_stride),
+ m_start_pos(k_start / rounded_stringlen),
+ m_start_offset(k_start % rounded_stringlen),
+ m_length(k_end - k_start),
+ m_rounded_stringlen(rounded_stringlen) { }
+
+ row_handler process_rows(unsigned int start_row, unsigned int active_height) const {
+ return row_handler(*this, start_row, active_height);
+ }
+ }; // end of "column handler" class
+
+public:
+ convolver(ConvolutionParameters params) :
+ m_params (params), m_pad_row(params.input_channels, static_cast<T>(params.padding_value)),
+ m_kernel_y(params.kernel_width * params.kernel_height, 0),
+ m_kernel_x(params.kernel_width * params.kernel_height, 0) {
+
+ // Kernel points are addressed across, then down (assumed weight layout is WHIO)
+ for (unsigned int ky=0; ky<params.kernel_height; ky++) {
+ for (unsigned int kx=0; kx<params.kernel_width; kx++) {
+ unsigned int n = (ky * params.kernel_width) + kx;
+ m_kernel_y[n] = ky - params.padding_top;
+ m_kernel_x[n] = kx - params.padding_left;
+ }
+ }
+ }
+
+ column_handler process_columns(const T *input_base, size_t input_stride,
+ unsigned int k_start, unsigned int k_end, unsigned int rounded_stringlen) const {
+ return column_handler(*this, input_base, input_stride, k_start, k_end, rounded_stringlen);
+ }
+};
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
index f3b66528a4..96b9734221 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_bf16.cpp
@@ -25,93 +25,78 @@
#include "bfloat.hpp"
#include "gemm_common.hpp"
#include "gemm_hybrid.hpp"
+#include "gemm_hybrid_indirect.hpp"
#include "gemm_implementation.hpp"
#include "gemm_interleaved.hpp"
#include "gemv_batched.hpp"
#include "gemv_pretransposed.hpp"
-#include "kernels/a64_interleaved_bf16fp32_dot_12x8.hpp"
-#include "kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp"
-#include "kernels/a64_sgemm_12x8.hpp"
+#include "kernels/a64_hybrid_bf16fp32_dot_6x16.hpp"
+#include "kernels/a64_interleaved_bf16fp32_dot_8x12.hpp"
+#include "kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp"
+#include "kernels/a64_sgemm_8x12.hpp"
#include "kernels/a32_sgemm_8x6.hpp"
-#include "kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp"
-#include "kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp"
-#include "kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp"
-#include "kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp"
-#include "kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp"
-#include "kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp"
+#include "kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp"
+#include "kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp"
+#include "kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp"
namespace arm_gemm {
static const GemmImplementation<bfloat16, float> gemm_bf16_methods[] =
{
#ifdef V8P6_BF
-# ifdef __ARM_FEATURE_SVE
-{
- GemmMethod::GEMM_HYBRID,
- "hybrid_bf16fp32_mmla_6VLx2",
- [](const GemmArgs &args) { return (args._Ksize>=8); },
- [](const GemmArgs &args) { return ((args._Msize <= 4) && (args._Nsize <= hybrid_bf16fp32_mmla_6VLx2::out_width())); },
- [](const GemmArgs &args) { return new GemmHybrid<hybrid_bf16fp32_mmla_6VLx2, bfloat16, float>(args); }
-},
-{
- GemmMethod::GEMM_HYBRID,
- "hybrid_bf16fp32_mmla_8VLx2",
- [](const GemmArgs &args) { return (args._Ksize>=8); },
- [](const GemmArgs &args) { return (args._Msize <= 4); },
- [](const GemmArgs &args) { return new GemmHybrid<hybrid_bf16fp32_mmla_8VLx2, bfloat16, float>(args); }
-},
-{
- GemmMethod::GEMM_HYBRID,
- "hybrid_bf16fp32_mmla_4VLx4",
- [](const GemmArgs &args) { return (args._Ksize>=8); },
- [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); },
- [](const GemmArgs &args) { return new GemmHybrid<hybrid_bf16fp32_mmla_4VLx4, bfloat16, float>(args); }
-},
-{
- GemmMethod::GEMM_HYBRID,
- "hybrid_bf16fp32_dot_4VLx4",
- [](const GemmArgs &args) { return (args._Ksize>=8); },
- [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); },
- [](const GemmArgs &args) { return new GemmHybrid<hybrid_bf16fp32_dot_4VLx4, bfloat16, float>(args); }
-},
+#ifdef __ARM_FEATURE_SVE
{ // gemm_bf16_interleaved
GemmMethod::GEMM_INTERLEAVED,
- "interleaved_bf16fp32_mmla_3VLx8",
+ "sve_interleaved_bf16fp32_mmla_8x3VL",
[](const GemmArgs &args) { return (args._Ksize>4); },
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<interleaved_bf16fp32_mmla_3VLx8, bfloat16, float>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_bf16fp32_mmla_8x3VL, bfloat16, float>(args); }
+},
+{
+ GemmMethod::GEMM_HYBRID,
+ "sve_hybrid_bf16fp32_dot_6x4VL",
+ nullptr,
+ [](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_bf16fp32_dot_6x4VL, bfloat16, float>(args); }
},
{ // gemm_bf16_interleaved
GemmMethod::GEMM_INTERLEAVED,
- "interleaved_bf16fp32_dot_3VLx8",
+ "sve_interleaved_bf16fp32_dot_8x3VL",
[](const GemmArgs &args) { return (args._Ksize>2); },
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<interleaved_bf16fp32_dot_3VLx8, bfloat16, float>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_bf16fp32_dot_8x3VL, bfloat16, float>(args); }
},
# endif // SVE
{ // gemm_bf16_interleaved
GemmMethod::GEMM_INTERLEAVED,
- "interleaved_bf16fp32_mmla_12x8",
+ "a64_interleaved_bf16fp32_mmla_8x12",
[](const GemmArgs &args) { return (args._Ksize>4); },
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<interleaved_bf16fp32_mmla_12x8, bfloat16, float>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_bf16fp32_mmla_8x12, bfloat16, float>(args); }
+},
+{
+ GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_bf16fp32_dot_6x16",
+ nullptr,
+ nullptr,
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_bf16fp32_dot_6x16, bfloat16, float>(args); }
},
{ // gemm_bf16_interleaved
GemmMethod::GEMM_INTERLEAVED,
- "interleaved_bf16fp32_dot_12x8",
+ "a64_interleaved_bf16fp32_dot_8x12",
[](const GemmArgs &args) { return (args._Ksize>2); },
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<interleaved_bf16fp32_dot_12x8, bfloat16, float>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_bf16fp32_dot_8x12, bfloat16, float>(args); }
},
#endif // V8P6_BF
#ifdef __aarch64__
{
GemmMethod::GEMM_INTERLEAVED,
- "sgemm_12x8",
+ "a64_sgemm_8x12",
nullptr,
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<sgemm_12x8, bfloat16, float>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_sgemm_8x12, bfloat16, float>(args); }
},
#elif defined(__arm__)
{
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
index 91012218e5..de2e4f2c2b 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp16.cpp
@@ -29,15 +29,17 @@
#include "gemm_common.hpp"
#include "gemm_hybrid.hpp"
+#include "gemm_hybrid_indirect.hpp"
#include "gemm_implementation.hpp"
#include "gemm_interleaved.hpp"
#include "gemm_interleaved_pretransposed_2d.hpp"
#include "kernels/a32_sgemm_8x6.hpp"
-#include "kernels/a64_hgemm_24x8.hpp"
-#include "kernels/a64_sgemm_12x8.hpp"
-#include "kernels/sve_hybrid_fp16_mla_4VLx4.hpp"
-#include "kernels/sve_interleaved_fp16_mla_3VLx8.hpp"
+#include "kernels/a64_hgemm_8x24.hpp"
+#include "kernels/a64_hybrid_fp16_mla_6x32.hpp"
+#include "kernels/a64_sgemm_8x12.hpp"
+#include "kernels/sve_hybrid_fp16_mla_6x4VL.hpp"
+#include "kernels/sve_interleaved_fp16_mla_8x3VL.hpp"
namespace arm_gemm {
@@ -45,61 +47,51 @@ static const GemmImplementation<__fp16, __fp16> gemm_fp16_methods[] = {
#if defined(__ARM_FEATURE_SVE)
{
GemmMethod::GEMM_HYBRID,
- "hybrid_fp16_mla_4VLx4",
- [](const GemmArgs &args) { return (args._Ksize >= 8); },
+ "sve_hybrid_fp16_mla_6x4VL",
+ nullptr,
[](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
- [](const GemmArgs &args) { return new GemmHybrid<hybrid_fp16_mla_4VLx4, __fp16, __fp16>(args); }
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp16_mla_6x4VL, __fp16, __fp16>(args); }
},
{
GemmMethod::GEMM_INTERLEAVED,
- "interleaved_fp16_mla_3VLx8",
+ "sve_interleaved_fp16_mla_8x3VL",
[](const GemmArgs &args) { return (args._Ksize > 4); },
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<interleaved_fp16_mla_3VLx8, __fp16, __fp16>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp16_mla_8x3VL, __fp16, __fp16>(args); }
},
#endif
#if defined(__aarch64__) && (defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || defined(FP16_KERNELS))
{
- GemmMethod::GEMM_INTERLEAVED_2D,
- "hgemm_24x8_2d",
+ GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_fp16_mla_6x32",
#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
[](const GemmArgs &args) { return args._ci->has_fp16(); },
#else
nullptr,
#endif
- [](const GemmArgs &args) { return args._maxthreads >= 8; },
- [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<hgemm_24x8, __fp16, __fp16>(args); }
+ [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp16_mla_6x32, __fp16, __fp16>(args); }
},
{
GemmMethod::GEMM_INTERLEAVED,
- "hgemm_24x8_1d",
+ "a64_hgemm_8x24",
#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
[](const GemmArgs &args) { return args._ci->has_fp16(); },
#else
nullptr,
#endif
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<hgemm_24x8, __fp16, __fp16>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_hgemm_8x24, __fp16, __fp16>(args); }
},
-
#endif // aarch64 && FP16
#ifdef __aarch64__
-//Pretranpose, 2D split
-{
- GemmMethod::GEMM_INTERLEAVED_2D,
- "sgemm_12x8_2d",
- nullptr,
- [](const GemmArgs &args) { return args._maxthreads >= 8; },
- [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<sgemm_12x8, __fp16, __fp16>(args); }
-},
-//Tranpose, 1D split, with blockmanager
{
GemmMethod::GEMM_INTERLEAVED,
- "sgemm_12x8_1d",
+ "a64_sgemm_8x12",
nullptr,
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<sgemm_12x8, __fp16, __fp16>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_sgemm_8x12, __fp16, __fp16>(args); }
},
#elif defined(__arm__)
{
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index ddb438f06c..e9e335f500 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -24,6 +24,7 @@
#include "arm_gemm.hpp"
#include "gemm_common.hpp"
#include "gemm_hybrid.hpp"
+#include "gemm_hybrid_indirect.hpp"
#include "gemm_implementation.hpp"
#include "gemm_interleaved.hpp"
#include "gemm_interleaved_pretransposed_2d.hpp"
@@ -31,127 +32,130 @@
#include "gemv_pretransposed.hpp"
#include "kernels/a32_sgemm_8x6.hpp"
-#include "kernels/a64_hybrid_fp32_mla_16x4.hpp"
-#include "kernels/a64_hybrid_fp32_mla_4x8.hpp"
-#include "kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp"
-#include "kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp"
-#include "kernels/a64_sgemm_12x8.hpp"
-#include "kernels/a64_sgemv_pretransposed.hpp"
+#include "kernels/a64_gemv_fp32_mla_32.hpp"
+#include "kernels/a64_hybrid_fp32_mla_6x16.hpp"
+#include "kernels/a64_hybrid_fp32_mla_8x4.hpp"
+#include "kernels/a64_sgemm_8x12.hpp"
+#include "kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp"
+#include "kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp"
-#include "kernels/sve_hybrid_fp32_mla_4VLx4.hpp"
-#include "kernels/sve_hybrid_fp32_mmla_4VLx4.hpp"
-#include "kernels/sve_interleaved_fp32_mla_3VLx8.hpp"
-#include "kernels/sve_interleaved_fp32_mmla_3VLx8.hpp"
-#include "kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp"
+#include "kernels/sve_gemv_fp32_mla_8VL.hpp"
+#include "kernels/sve_hybrid_fp32_mla_6x4VL.hpp"
+#include "kernels/sve_hybrid_fp32_mla_8x1VL.hpp"
+#include "kernels/sve_interleaved_fp32_mla_8x3VL.hpp"
+#include "kernels/sve_interleaved_fp32_mmla_8x3VL.hpp"
+#include "kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp"
namespace arm_gemm {
static const GemmImplementation<float, float> gemm_fp32_methods[] =
{
+// GEMV cases - starting with 'gemv_batched' wrapper to turn batched GEMV into GEMM.
{
GemmMethod::GEMV_BATCHED,
"gemv_batched",
- [](const GemmArgs &args) { return (args._Msize==1) && (args._nbatches>1); },
+ [](const GemmArgs &args) { return args._Msize==1 && args._nbatches>1 && !args._indirect_input; },
nullptr,
[](const GemmArgs &args) { return new GemvBatched<float, float>(args); }
},
#ifdef __aarch64__
+#ifdef __ARM_FEATURE_SVE
{
- GemmMethod::GEMV_PRETRANSPOSED,
- "sgemv_pretransposed",
- [](const GemmArgs &args) { return (args._Msize==1 && args._nbatches==1); },
+ GemmMethod::GEMM_HYBRID,
+ "sve_gemv_fp32_mla_8VL",
+ [](const GemmArgs &args) { return args._Msize==1 && args._nbatches==1 && !args._indirect_input; },
nullptr,
- [](const GemmArgs &args) { return new GemvPretransposed<sgemv_pretransposed, float, float>(args); }
+ [](const GemmArgs &args) { return new GemvPretransposed<cls_sve_gemv_fp32_mla_8VL, float, float>(args); }
},
-#if defined(__ARM_FEATURE_SVE) && defined(MMLA_FP32)
+#endif
{
GemmMethod::GEMM_HYBRID,
- "hybrid_fp32_mmla_4VLx4",
- [](const GemmArgs &args) { return (args._Ksize >= 4); },
- [](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
- [](const GemmArgs &args) { return new GemmHybrid<hybrid_fp32_mmla_4VLx4, float, float>(args); }
+ "a64_gemv_fp32_mla_32",
+ [](const GemmArgs &args) { return args._Msize==1 && args._nbatches==1 && !args._indirect_input; },
+ nullptr,
+ [](const GemmArgs &args) { return new GemvPretransposed<cls_a64_gemv_fp32_mla_32, float, float>(args); }
},
+
+// MMLA next due to higher throughput (SVE only)
+#if defined(__ARM_FEATURE_SVE) && defined(MMLA_FP32)
{
GemmMethod::GEMM_INTERLEAVED,
- "interleaved_fp32_mmla_3VLx8",
+ "sve_interleaved_fp32_mmla_8x3VL",
[](const GemmArgs &args) { return (args._Ksize>4); },
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<interleaved_fp32_mmla_3VLx8, float, float>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp32_mmla_8x3VL, float, float>(args); }
},
#endif // __ARM_FEATURE_SVE && MMLA_FP32
#ifdef __ARM_FEATURE_SVE
-// SVE smallk / hybrid methods
+// SVE smallk / hybrid methods
{
GemmMethod::GEMM_HYBRID,
- "smallK_hybrid_fp32_mla_1VLx8",
- [](const GemmArgs &args) { return (args._Ksize <= 24); },
+ "sve_smallK_hybrid_fp32_mla_8x1VL",
+ [](const GemmArgs &args) { return args._Ksize <= 24 && !args._indirect_input; },
nullptr,
- [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_fp32_mla_1VLx8, float, float>(args); }
+ [](const GemmArgs &args) { return new GemmHybrid<cls_sve_smallK_hybrid_fp32_mla_8x1VL, float, float>(args); }
},
{
GemmMethod::GEMM_HYBRID,
- "hybrid_fp32_mla_4VLx4",
- [](const GemmArgs &args) { return (args._Ksize >= 4); },
+ "sve_hybrid_fp32_mla_8x1VL",
+ nullptr,
+ [](const GemmArgs &args) { return (args._Nsize < 12); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32_mla_8x1VL, float, float>(args); }
+},
+{
+ GemmMethod::GEMM_HYBRID,
+ "sve_hybrid_fp32_mla_6x4VL",
+ nullptr,
[](const GemmArgs &args) { return ((args._Ksize <= 256) && (args._Nsize <= 256)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
- [](const GemmArgs &args) { return new GemmHybrid<hybrid_fp32_mla_4VLx4, float, float>(args); }
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_fp32_mla_6x4VL, float, float>(args); }
},
#endif // __ARM_FEATURE_SVE
// NEON hybrid methods
{
GemmMethod::GEMM_HYBRID,
- "smallK_hybrid_fp32_mla_4x8",
- [](const GemmArgs &args) { return (args._Ksize <= 8) && (args._Nsize % 4)==0; },
+ "a64_smallK_hybrid_fp32_mla_8x4",
+ [](const GemmArgs &args) { return args._Ksize <= 8 && (args._Nsize % 4)==0 && !args._indirect_input; },
nullptr,
- [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_fp32_mla_4x8, float, float>(args); }
+ [](const GemmArgs &args) { return new GemmHybrid<cls_a64_smallK_hybrid_fp32_mla_8x4, float, float>(args); }
},
{
GemmMethod::GEMM_HYBRID,
- "smallK_hybrid_fp32_mla_4x6",
- [](const GemmArgs &args) { return (args._Ksize > 8) && (args._Ksize <= 16) && (args._Nsize % 4)==0; },
+ "a64_smallK_hybrid_fp32_mla_6x4",
+ [](const GemmArgs &args) { return (args._Ksize > 8 && args._Ksize <= 16) && (args._Nsize % 4)==0 && !args._indirect_input; },
nullptr,
- [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_fp32_mla_4x6, float, float>(args); }
+ [](const GemmArgs &args) { return new GemmHybrid<cls_a64_smallK_hybrid_fp32_mla_6x4, float, float>(args); }
},
{
GemmMethod::GEMM_HYBRID,
- "hybrid_fp32_mla_4x8_normal",
- [](const GemmArgs &args) { return (args._Ksize >= 4); },
+ "a64_hybrid_fp32_mla_8x4",
+ nullptr,
[](const GemmArgs &args) { return (args._Nsize < 12); },
- [](const GemmArgs &args) { return new GemmHybrid<hybrid_fp32_mla_4x8, float, float>(args); }
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32_mla_8x4, float, float>(args); }
},
GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_HYBRID,
- "hybrid_fp32_mla_16x4",
- [](const GemmArgs &args) { return (args._Ksize >= 4); },
- [](const GemmArgs &args) { return GemmHybrid<hybrid_fp32_mla_16x4, float, float>::estimate_cycles(args, hybrid_fp32_mla_16x4::get_performance_parameters(args._ci)); },
- [](const GemmArgs &args) { return new GemmHybrid<hybrid_fp32_mla_16x4, float, float>(args); }
+ "a64_hybrid_fp32_mla_6x16",
+ nullptr,
+ [](const GemmArgs &args) { return GemmHybridIndirect<cls_a64_hybrid_fp32_mla_6x16, float, float>::estimate_cycles(args, cls_a64_hybrid_fp32_mla_6x16::get_performance_parameters(args._ci)); },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_fp32_mla_6x16, float, float>(args); }
),
-
#ifdef __ARM_FEATURE_SVE
{
GemmMethod::GEMM_INTERLEAVED,
- "interleaved_fp32_mla_3VLx8",
+ "sve_interleaved_fp32_mla_8x3VL",
[](const GemmArgs &args) { return (args._Ksize>4); },
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<interleaved_fp32_mla_3VLx8, float, float>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_fp32_mla_8x3VL, float, float>(args); }
},
#endif // __ARM_FEATURE_SVE
-// Pretranposed, 2D split
-GemmImplementation<float, float>::with_estimate(
- GemmMethod::GEMM_INTERLEAVED_2D,
- "sgemm_12x8_2d",
- nullptr,
- [](const GemmArgs &args) { return GemmInterleavedPretransposed2d<sgemm_12x8, float, float>::estimate_cycles(args, sgemm_12x8::get_performance_parameters(args._ci)); },
- [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<sgemm_12x8, float, float>(args); }
-),
-// 1D split (with pretransposed or not)
GemmImplementation<float, float>::with_estimate(
GemmMethod::GEMM_INTERLEAVED,
- "sgemm_12x8_1d",
+ "a64_sgemm_8x12",
nullptr,
- [](const GemmArgs &args) { return GemmInterleaved<sgemm_12x8, float, float>::estimate_cycles(args, sgemm_12x8::get_performance_parameters(args._ci)); },
- [](const GemmArgs &args) { return new GemmInterleaved<sgemm_12x8, float, float>(args); }
+ [](const GemmArgs &args) { return GemmInterleaved<cls_a64_sgemm_8x12, float, float>::estimate_cycles(args, cls_a64_sgemm_8x12::get_performance_parameters(args._ci)); },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_sgemm_8x12, float, float>(args); }
),
#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
index 7a983ed6ac..d702cffce1 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
@@ -77,51 +77,43 @@ class GemmHybrid : public GemmCommon<To, Tr> {
return args._cfg->inner_block_size;
}
- const unsigned int L1_size = args._ci->get_L1_cache_size();
+ // Target block size (512 for FP32, scaling for other types). Don't block until size reaches 1.5X this.
+ unsigned int target_block_size = 2048 / sizeof(To);
- // k_block: Find out how much of the larger array can be loaded into half the cache.
- // This should account for associative caches.
- unsigned int k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+ if (args._Ksize >= ((3 * target_block_size) / 2)) {
+ unsigned int target_blocks = iceildiv(args._Ksize, target_block_size);
- // Needs to be (at least a single) multiple of the K unroll level.
- k_block /= strategy::k_unroll();
- k_block = std::max(k_block, 1U) * strategy::k_unroll();
+ unsigned int block_size = iceildiv(args._Ksize, target_blocks);
- // Now tune to presented problem size; this is how many blocks we need.
- unsigned int numk_blocks = iceildiv(args._Ksize, k_block);
+ block_size = roundup(block_size, strategy::k_unroll());
- // So divide the space equally into that many blocks.
- k_block = iceildiv(args._Ksize, numk_blocks);
-
- // And round UP to the K unroll level required.
- k_block = roundup(k_block, strategy::k_unroll());
+ return block_size;
+ }
- return k_block;
+ return args._Ksize;
}
+ // New N blocking strategy: if it's narrow, or much taller than it is wide, do the full width. Otherwise do a
+ // single block.
static unsigned int compute_n_block(const GemmArgs &args) {
if (args._cfg && args._cfg->outer_block_size) {
return args._cfg->outer_block_size;
}
- const unsigned int k_block = compute_k_block(args);
- const unsigned int L2_size = args._ci->get_L2_cache_size();
-
- // n_block: Work out how many rows (of length k_block) will fit in the L2
- // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
- unsigned int n_block = (((L2_size * 9) / 10) - (k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
- (sizeof(Toi) * k_block);
+ if (args._Nsize <= 64) {
+ return args._Nsize;
+ }
- // Needs to be (at least a single) multiple of the kernel output width.
- n_block /= strategy::out_width();
- n_block = std::max(n_block, 1U) * strategy::out_width();
+ if ((args._Msize / args._Nsize) > 155) {
+ return args._Nsize;
+ }
- // And tune to the presented problem size.
- unsigned int numblocks = iceildiv(args._Nsize, n_block);
- n_block = iceildiv(args._Nsize, numblocks);
- n_block = roundup(n_block, strategy::out_width());
+ // Go slightly wider if thread count and depth are small.
+ if ((args._Ksize <= 128) && (args._maxthreads <= 16)) {
+ return strategy::out_width() * 3;
+ }
- return n_block;
+ return strategy::out_width();
}
public:
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
new file mode 100644
index 0000000000..eede1a4f76
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_indirect.hpp
@@ -0,0 +1,621 @@
+/*
+ * Copyright (c) 2017-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <alloca.h>
+
+#include <algorithm>
+#include <cassert>
+
+#include "arm_gemm.hpp"
+#include "bias_adder.hpp"
+#include "convolver.hpp"
+#include "ndrange.hpp"
+#include "performance_parameters.hpp"
+#include "transform.hpp"
+#include "utils.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+#ifndef UNUSED
+#define __I_DEFINED_UNUSED
+#define UNUSED(x) ((void)(x))
+#endif
+
+namespace arm_gemm {
+
+namespace {
+
+// The kernel has to be invoked differently for quantizing and non-quantizing cases, so this shim class hides the
+// difference behind one static run() signature.  Only the declaration is given here; the behaviour comes from the
+// explicit specializations on <OutputStage, SeparateQuantize> that follow.
+template<typename OutputStage, bool SeparateQuantize = false>
+class run_hybrid_kernel {
+public:
+ template<typename strategy, typename To, typename Tr>
+ static void run (
+#ifdef CYCLE_PROFILING
+ profiler &prof,
+#endif
+ const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
+ unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
+ const OutputStage &os, const int32_t *col_bias, unsigned int n_0 );
+};
+// Non-quantizing case: forward to the strategy's kernel with bias, activation and accumulate flag.  The output stage, column bias and n_0 arguments are unnamed and ignored.
+template<>
+template<typename strategy, typename To, typename Tr>
+void run_hybrid_kernel<Nothing, false>::run(
+#ifdef CYCLE_PROFILING
+ profiler &prof,
+#endif
+ const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
+ unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *bias_ptr, Activation act, bool accumulate,
+ const Nothing &, const int32_t *, unsigned int) {
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
+#endif
+ UNUSED(kern_k); // kern_k only feeds the profiler estimate above; silence the warning when profiling is off.
+
+ strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, output_arg, bias_ptr, act, accumulate);
+}
+// Quantizing case without a separate quantize step: the kernel is handed the Requantize32 parameters, the column bias advanced by n_0, and n_0 itself.  Bias pointer, activation and accumulate flag are unnamed and ignored.
+template<>
+template<typename strategy, typename To, typename Tr>
+void run_hybrid_kernel<Requantize32, false>::run(
+#ifdef CYCLE_PROFILING
+ profiler &prof,
+#endif
+ const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
+ unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
+ const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) {
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
+#endif
+ UNUSED(kern_k); // kern_k only feeds the profiler estimate above; silence the warning when profiling is off.
+
+ strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, output_arg, &os, col_bias + n_0, n_0);
+}
+// Quantizing case with a separate quantize step: run the kernel into a temporary buffer of strategy::result_type, obtain the row sums (computed only when b_offset != 0, otherwise zeroed), then requantize into the real output.
+template<>
+template<typename strategy, typename To, typename Tr>
+void run_hybrid_kernel<Requantize32, true>::run(
+#ifdef CYCLE_PROFILING
+ profiler &prof,
+#endif
+ const strategy &strat, unsigned int num_strings, const unsigned int *string_ptr, IndirectInputArg<To> A_arg, unsigned int M, unsigned int N,
+ unsigned int kern_k, const To *b_ptr, IndirectOutputArg<Tr> output_arg, const Tr *, Activation, bool,
+ const Requantize32 &os, const int32_t *col_bias, unsigned int n_0 ) {
+ UNUSED(kern_k);
+ // On this route we will only process one kernel height at a time and will make sure this happens in the driver loop.
+ assert(M <= strategy::out_height());
+ // We don't yet support indirect output (as the quantizer can't do it).
+ assert(output_arg.is_indirect == false);
+
+ // We need a row sum buffer and intermediate output buffer.
+ // These go on the stack as they are not too large, using an automatic array and alloca() respectively.
+ int32_t row_sums[strategy::out_height()];
+ typename strategy::result_type *result_buffer;
+
+ // The intermediate buffer is padded out to a whole number of kernel widths per row.
+ unsigned int output_width = roundup(N, strategy::out_width());
+
+ result_buffer = reinterpret_cast<typename strategy::result_type *>(alloca(output_width * strategy::out_height() * sizeof(typename strategy::result_type)));
+
+ {
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)M * kern_k * roundup(N, strategy::out_width()));
+#endif
+ // Perform the GEMM, into the output buffer.  No bias, default activation and no accumulation are passed on this route.
+ strat.kernel(num_strings, string_ptr, A_arg, M, N, b_ptr, IndirectOutputArg<typename strategy::result_type>(result_buffer, output_width), nullptr, Activation(), false);
+ }
+
+ if (os.b_offset != 0) {
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(PROFILE_ROWSUMS, (unsigned long)M * kern_k);
+#endif
+ row_sums_indirect(num_strings, string_ptr, A_arg, M, row_sums, &os);
+ } else {
+ memset(row_sums, 0, sizeof(int32_t) * strategy::out_height()); // NOTE(review): <cstring> is not included directly by this file - confirm it arrives via another header.
+ }
+
+ {
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(PROFILE_QUANTIZE, (unsigned long)M * N);
+#endif
+ // Quantize from the intermediate buffer into the (direct) output, using the row sums and the column bias advanced by n_0.
+ requantize_block_32(os, N, M, result_buffer, output_width, output_arg.direct.base, output_arg.direct.stride, row_sums, col_bias + n_0, n_0);
+ }
+}
+
+} // anonymous namespace
+
+// Implementation of the GemmCommon abstract class.
+template<typename strategy, typename To, typename Tr, typename OutputStage = Nothing, bool SeparateQuantize = false>
+class GemmHybridIndirect : public GemmCommon<To, Tr> {
+ typedef typename strategy::operand_type Toi;
+ typedef typename strategy::result_type Tri;
+
+ GemmArgs _args;
+ OutputStage _os = {};
+
+ /* Quantized support (in addition to 'output stage' above) */
+ int32_t *_col_bias = nullptr;
+
+ const unsigned int _Ktotal;
+ const unsigned int _rounded_Ksize;
+
+ /* Blocking info */
+ const unsigned int _k_block;
+ const unsigned int _n_block;
+ const unsigned int _Mround;
+
+ /* Pretransposed buffer. */
+ const Toi *_B_transposed=nullptr;
+
+ /* Indirect parameters. _indirect_buf doubles as a flag to indicate that "indirect" transform should be used. */
+ const To * const * const * _indirect_buf = nullptr;
+
+ /* Convolver - only set up for convolution problems, so also doubles as a flag. */
+ std::unique_ptr<convolver<To>> _convolver = nullptr;
+
+ // Array of pointers to output rows
+// Tr * const * _output_ptrs;
+
+ const NDRange<4> _window_range;
+
+    /* Bytes reserved for column sums: one int32_t per column per multi when the output stage is Requantize32,
+     * nothing otherwise. */
+    unsigned int get_col_sum_size() const {
+        if (!std::is_same<OutputStage, Requantize32>::value) {
+            return 0;
+        }
+
+        return _args._Nsize * _args._nmulti * sizeof(int32_t);
+    }
+
+    /* Total K extent over all sections, with each section's length rounded up to the kernel's K unroll. */
+    static unsigned int get_ktotal(const GemmArgs &args) {
+        const auto padded_section = roundup(args._Ksize, strategy::k_unroll());
+
+        return args._Ksections * padded_section;
+    }
+
+    /* Pick the K block size for this problem. */
+    static unsigned int compute_k_block(const GemmArgs &args) {
+        const unsigned int k_total = get_ktotal(args);
+
+        // K blocking relies on accumulate mode, so kernels without it - and the requantizing output stage -
+        // have to take the whole of K in one pass.
+        const bool blocking_allowed = strategy::supports_accumulate() && !std::is_same<OutputStage, Requantize32>::value;
+        if (!blocking_allowed) {
+            return k_total;
+        }
+
+        // An explicit configuration overrides the heuristic below.
+        if (args._cfg && args._cfg->inner_block_size) {
+            return args._cfg->inner_block_size;
+        }
+
+        // Experimental data suggests an optimal block size of 512 for FP32 (scaled for other datatypes), but
+        // splitting only starts once the total exceeds 1.5X that size.
+        const unsigned int target_block_size = 2048 / sizeof(To);
+        const unsigned int split_threshold   = (target_block_size * 3) / 2;
+
+        if (k_total <= split_threshold) {
+            return k_total;
+        }
+
+        // Divide K evenly over the number of target-sized blocks, then pad each block to the K unroll.
+        const unsigned int target_blocks = iceildiv(k_total, target_block_size);
+
+        return roundup(iceildiv(k_total, target_blocks), strategy::k_unroll());
+    }
+
+ // N blocking strategy: a configured outer_block_size always wins.  Narrow problems (N <= 64) and problems much taller than they are wide (M / N > 155) do the full width.
+ // Requantizing GEMMs with a non-zero b_offset size the block from the column parallelism they need; everything else uses one kernel width (or three, for small K and thread counts).
+ static unsigned int compute_n_block(const GemmArgs &args, const OutputStage os = {}) {
+ if (args._cfg && args._cfg->outer_block_size) {
+ return args._cfg->outer_block_size;
+ }
+
+ if (args._Nsize <= 64) {
+ return args._Nsize;
+ }
+
+ if ((args._Msize / args._Nsize) > 155) {
+ return args._Nsize;
+ }
+
+ // "Asymmetric" quantizing GEMMs require a different approach - the tall skinny blocks we would otherwise
+ // use imply a great deal of repeated work performing the row sums. If row sums are involved, work out how
+ // much "column" parallelism is going to be required and set the block size accordingly.
+ if (std::is_same<OutputStage, Requantize32>::value) {
+ const Requantize32 *qp = reinterpret_cast<const Requantize32 *>(&os); // Only dereferenced when OutputStage really is Requantize32 (guarded by the test above).
+
+ // Row sums only needed if b_offset isn't 0
+ if (qp->b_offset != 0) {
+ // We can already parallelize across batches, multis and rows (in units of 'out_height')
+ int multi_row_parallelism = args._nmulti * args._nbatches * iceildiv(args._Msize, strategy::out_height());
+
+ // If this isn't enough, we will need to split up the columns too.
+ if (multi_row_parallelism < args._maxthreads) {
+ unsigned int columns_needed = iceildiv(args._maxthreads, multi_row_parallelism);
+
+ unsigned int n_block = iceildiv(args._Nsize, columns_needed);
+
+ return roundup(n_block, strategy::out_width());
+ }
+
+ // Multi/Batch/Row parallelism is enough - don't split up the columns.
+ return args._Nsize;
+ }
+ }
+
+ if (args._Ksize <= 128 && args._maxthreads <= 16) { // Go slightly wider if thread count and depth are small.
+ return strategy::out_width() * 3;
+ }
+
+ return strategy::out_width();
+ }
+
+public:
+ GemmHybridIndirect(GemmHybridIndirect &) = delete;
+ GemmHybridIndirect & operator= (GemmHybridIndirect &) = delete;
+
+ /* Constructor taking an output stage: copies the arguments and the output stage, derives the K and N block sizes, and builds the 4D work window (row blocks of out_height, batches, N blocks, multis). */
+ GemmHybridIndirect(const GemmArgs &args, const OutputStage &os)
+ : _args(args), _os(os), _Ktotal(get_ktotal(args)),
+ _rounded_Ksize(roundup(args._Ksize, strategy::k_unroll())),
+ _k_block(compute_k_block(args)), _n_block(compute_n_block(args, os)),
+ _Mround(roundup(args._Msize, strategy::out_height())),
+ _window_range(iceildiv(args._Msize, strategy::out_height()), args._nbatches,
+ iceildiv(args._Nsize, _n_block), args._nmulti)
+ {
+ // We take a copy of the arguments (not a pointer or reference), but there is no lifetime requirement on the
+ // GemmConfig. Clear out the pointer to avoid accidents.
+ _args._cfg = nullptr;
+ }
+
+ /* Constructor without OutputStage: _os keeps its default-initialised value and compute_n_block() is called with its default output stage argument; otherwise set up exactly as the constructor above. */
+ GemmHybridIndirect(const GemmArgs &args)
+ : _args(args), _Ktotal(get_ktotal(args)),
+ _rounded_Ksize(roundup(args._Ksize, strategy::k_unroll())),
+ _k_block(compute_k_block(args)), _n_block(compute_n_block(args)),
+ _Mround(roundup(args._Msize, strategy::out_height())),
+ _window_range(iceildiv(args._Msize, strategy::out_height()), args._nbatches,
+ iceildiv(args._Nsize, _n_block), args._nmulti)
+ {
+ // We take a copy of the arguments (not a pointer or reference), but there is no lifetime requirement on the
+ // GemmConfig. Clear out the pointer to avoid accidents.
+ _args._cfg = nullptr;
+ }
+
+ // Interface implementation - Compulsory functions.  The window size reported is the total number of entries in the 4D window range built by the constructor.
+ ndrange_t get_window_size() const override {
+ return { _window_range.total_size() };
+ }
+
+ // This kernel can always be dynamically scheduled (execute() handles all of K for each work item, so per its own comment no synchronization on the output is needed).
+ bool supports_dynamic_scheduling() const override {
+ return true;
+ }
+
+ // Execute the work items selected by work_range.  K blocks form the outer loop; inside each, the window iterator walks (row block, batch, N block, multi) and dispatches to the indirect, convolution or direct input path.
+ void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override {
+#ifdef CYCLE_PROFILING
+ profiler prof;
+#endif
+ strategy strat(_args._ci);
+
+ std::vector<const To *> in_row_ptrs;
+ std::vector<const To * const *> in_row_strings;
+ std::vector<unsigned int> string_lengths;
+
+ // In convolution mode, we need input pointers: out_height pointers per K section, with one "string" entry per section pointing at that section's group.
+ if (_convolver) {
+ in_row_ptrs = std::vector<const To *>(strategy::out_height() * _args._Ksections, nullptr);
+ in_row_strings = std::vector<const To * const *>(_args._Ksections, nullptr);
+
+ for (unsigned int i=0; i<_args._Ksections; i++) {
+ in_row_strings[i] = &(in_row_ptrs[i * strategy::out_height()]);
+ }
+ }
+
+ // In any indirect mode, we need the string lengths.
+ if (_args._indirect_input) {
+ string_lengths = std::vector<unsigned int>(_args._Ksections, 0);
+ }
+
+ /* Make sure we've been set up correctly. */
+ assert(_B_transposed);
+ static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");
+// static_assert(std::is_same<Tr, Tri>::value, "gemm_native: Result types must be the same.");
+
+ /* For now, each work item implies all the K for a given output
+ * pixel (so we don't need to synchronize access to the output
+ * array). So separate the loop over K blocks here. */
+ for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
+ unsigned int kmax = std::min(k0 + _k_block, _Ktotal);
+ unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll());
+
+ const bool first_pass = (k0 == 0);
+ const bool last_pass = (kmax == _Ktotal);
+
+ // Locate this K block within the padded sections: which section it starts in, and how far into it.
+ unsigned int first_section = (k0 / _rounded_Ksize);
+ unsigned int first_offset = (k0 % _rounded_Ksize);
+ unsigned int kleft = kern_k;
+ unsigned int sections=0;
+ unsigned int offset = first_offset;
+
+ if (_args._indirect_input) {
+ while (kleft) {
+ // When chopping into sections: the amount that goes into 'string_lengths' is the amount to be
+ // processed (excluding padding). But the amount we subtract from 'kleft' takes account of any
+ // padding applied.
+ string_lengths[sections] = std::min(kleft, _args._Ksize - offset);
+ kleft -= std::min(kleft, _rounded_Ksize - offset);
+ sections++;
+ offset=0;
+ }
+ }
+
+ auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0));
+
+ if (p.done()) {
+ return;
+ }
+
+ // Process rows either 'out_height' rows at a time, or do all valid rows at once with a single kernel call.
+ // The separate quantizer path only handles one block of rows at a time (as it has to store sums and intermediate results).
+ // The convolution path only generates the pointers for one block of rows at a time.
+ const bool process_all_rows = (!SeparateQuantize && !_convolver);
+
+ do {
+ const unsigned int m_start = p.dim(0) * strategy::out_height();
+ const unsigned int m_end = process_all_rows ? std::min(p.dim0_max() * strategy::out_height(), _args._Msize) : std::min(m_start + strategy::out_height(), _args._Msize);
+// const unsigned int m_end = std::min(m_start + strategy::out_height(), _args._Msize);
+ const unsigned int batch = p.dim(1);
+ const unsigned int n0 = p.dim(2) * _n_block;
+ const unsigned int nmax = std::min(n0 + _n_block, _args._Nsize);
+ const unsigned int multi = p.dim(3);
+
+ // Offset into the pretransposed B: skip whole multis, then the K blocks before k0, then the columns before n0 within this K block.
+ const Toi *b_panel = _B_transposed +
+ (multi * roundup(_args._Nsize, strategy::out_width()) * _Ktotal) +
+ (k0 * roundup(_args._Nsize, strategy::out_width())) +
+ (n0 * kern_k);
+
+ IndirectOutputArg<Tr> out_arg(this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc);
+
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width())); // NOTE(review): shadows the window iterator 'p' for the rest of this body, and run_hybrid_kernel::run opens its own PROFILE_KERNEL scope too.
+#endif
+ if (_indirect_buf) {
+ run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
+#ifdef CYCLE_PROFILING
+ prof,
+#endif
+ strat, sections, string_lengths.data(),
+ IndirectInputArg<To>(_indirect_buf + (multi * _args._nbatches * _args._Ksections) + (batch * _args._Ksections) + first_section, m_start, first_offset),
+ (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
+ (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
+ last_pass ? _args._act : Activation(),
+ !first_pass,
+ // Quantization parameters
+ _os, _col_bias+(multi * _args._Nsize), n0);
+ } else if (_convolver) {
+ auto conv_cols = _convolver->process_columns(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride), this->_lda, k0, kmax, _rounded_Ksize);
+
+ unsigned int pos=0;
+ auto conv_rows = conv_cols.process_rows(m_start, m_end - m_start);
+
+ // Fill in_row_ptrs one section at a time; the asserts cross-check the convolver's blocks against the string lengths computed above.
+ while (!conv_rows.finished()) {
+ unsigned int width, conv_offset;
+
+ assert(pos < sections);
+
+ std::tie(width, conv_offset) = conv_rows.next_block(&(in_row_ptrs[pos * strategy::out_height()]));
+
+ if (pos==0) {
+ assert(conv_offset == first_offset);
+ }
+ assert(width == string_lengths[pos]);
+ pos++;
+ }
+ assert(pos == sections);
+
+ run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
+#ifdef CYCLE_PROFILING
+ prof,
+#endif
+ strat, sections, string_lengths.data(),
+ IndirectInputArg<To>(in_row_strings.data(), 0, first_offset),
+ (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
+ (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
+ last_pass ? _args._act : Activation(),
+ !first_pass,
+ // Quantization parameters
+ _os, _col_bias+(multi * _args._Nsize), n0);
+ } else {
+ // Length to process. This needs to exclude padding, but 'kmax' potentially includes it.
+ const unsigned int len = (std::min(_args._Ksize, kmax) - k0);
+
+ run_hybrid_kernel<OutputStage, SeparateQuantize>::run(
+#ifdef CYCLE_PROFILING
+ prof,
+#endif
+ strat, 1, &len,
+ IndirectInputArg<To>(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + m_start * this->_lda + k0, this->_lda),
+ (m_end - m_start), (nmax - n0), kern_k, b_panel, out_arg,
+ (this->_bias && first_pass) ? this->_bias + (multi * this->_bias_multi_stride) + n0 : nullptr,
+ last_pass ? _args._act : Activation(),
+ !first_pass,
+ // Quantization parameters
+ _os, _col_bias+(multi * _args._Nsize), n0);
+ }
+ } while (process_all_rows ? p.next_dim1() : p.next_dim0());
+ }
+ }
+
+ // Interface implementation - pretransposed.  This implementation always works from a pretransposed copy of B.
+ bool B_is_pretransposed() const override {
+ return true;
+ }
+ // Pretransposition is still outstanding until _B_transposed has been set (by pretranspose_B_array() or set_pretransposed_B_data()).
+ bool B_pretranspose_required() const override {
+ return (_B_transposed==nullptr);
+ }
+
+    /* Total bytes the caller must provide for the pretransposed buffer. */
+    size_t get_B_pretransposed_array_size() const override {
+        // The pretransposed copy of B itself: N padded to the kernel width, times the padded K total, per multi.
+        const size_t transposed_bytes = roundup(_args._Nsize, strategy::out_width()) * _Ktotal * _args._nmulti * sizeof(Toi);
+
+        // Result row pointers (not strictly needed any more but retained for indirect output testing).
+        const size_t row_pointer_bytes = _args._Msize * _args._nbatches * _args._nmulti * sizeof(const Tr *);
+
+        // Column sums are only stored for requantizing GEMMs.
+        const size_t col_sum_bytes = std::is_same<OutputStage, Requantize32>::value ? get_col_sum_size() : 0;
+
+        return transposed_bytes + row_pointer_bytes + col_sum_bytes;
+    }
+ // Fill in_buffer with everything derived from B: for Requantize32, the column sums first; then B rearranged by the strategy's PrepareB transform, K block by K block, with each K section padded to the K unroll.
+ void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
+ if (std::is_same<OutputStage, Requantize32>::value) {
+ _col_bias = reinterpret_cast<int32_t *>(in_buffer);
+
+ Requantize32 *qp_ptr = reinterpret_cast<Requantize32 *>(&_os);
+
+ for (unsigned int i=0; i<_args._nmulti; i++) {
+ // The input is assumed not to have any padding between sections, so straightforward Ksize * Ksections computation gets the total size.
+ compute_col_sums(*qp_ptr, _args._Nsize, _args._Ksize * _args._Ksections, B + (i * B_multi_stride), ldb, _col_bias + (i * _args._Nsize), _args._Ksize * _args._Ksections, i, 0);
+ }
+ }
+
+ // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0
+ uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
+ Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
+ _B_transposed = buffer;
+
+ strategy strat(_args._ci);
+
+ for (unsigned int multi=0; multi<_args._nmulti; multi++) {
+ for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
+ const unsigned int kmax=std::min(k0 + _k_block, _Ktotal);
+
+ /* Figure out the size of each block. */
+ unsigned int k_size = kmax - k0;
+
+ // We need to insert padding at the end of each K section.
+ // The computation needed is a little delicate - the coordinates from the block walker are expressed in
+ // terms of the full, padded, _Ktotal.
+ // But we need to transform each section with reference to the original, unpadded, input, letting the
+ // transform pad each section as needed.
+
+ // This is needed for computations below.  (Same value as the _rounded_Ksize member.)
+ const unsigned int rounded_section_size = roundup(_args._Ksize, strategy::k_unroll());
+
+ // The expected output format is also an entire <out_width> columns interleaved, then the next set of
+ // columns, and so on. This means, as we are breaking it up vertically, we have to do it one column at
+ // a time.
+ for (unsigned int x0=0; x0 < _args._Nsize; x0 += strategy::out_width() ){
+ unsigned int xmax = std::min(x0 + strategy::out_width(), _args._Nsize);
+
+ // Track where we are and how much work is left.
+ unsigned int kpos = k0;
+ unsigned int kleft = k_size;
+
+ while (kleft) {
+ // Which section are we in? Based on the rounded-up section size.
+ unsigned int k_section_base = kpos / rounded_section_size;
+ // How far into the section are we?
+ unsigned int k_offset = kpos - (k_section_base * rounded_section_size);
+
+ // We will either copy the rest of this section, or to the end of the requested length.
+ unsigned int k_length = std::min(_args._Ksize - k_offset, kleft);
+
+ strat.transforms.PrepareB(buffer, B + (multi * B_multi_stride), ldb,
+ x0, xmax,
+ (k_section_base * _args._Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length.
+ (k_section_base * _args._Ksize) + k_offset + k_length); // K end point - starting point plus length computed above.
+
+ // We need to modify our position based on the ROUNDED version of what we just did.
+ unsigned int padded_length = roundup(k_length, strategy::k_unroll());
+
+ buffer += strategy::out_width() * padded_length;
+
+ kpos += padded_length;
+ kleft -= padded_length; // NOTE(review): assumes kleft is always at least padded_length here (true when the K block size is a multiple of k_unroll) - confirm for configured inner_block_size values.
+ }
+ }
+ }
+ }
+ }
+
+    /* Adopt a buffer that was filled by an earlier pretranspose_B_array() call. */
+    void set_pretransposed_B_data(void *in_buffer) override {
+        // The column sums sit at the front of the buffer...
+        _col_bias = reinterpret_cast<int32_t *>(in_buffer);
+
+        // ...and the transposed data follows them (get_col_sum_size() is 0 when no sums are stored).
+        const uintptr_t transposed_addr = reinterpret_cast<uintptr_t>(in_buffer) + get_col_sum_size();
+        _B_transposed = reinterpret_cast<Toi *>(transposed_addr);
+    }
+
+    /* Estimate the cycle count for this problem from the supplied performance parameters. */
+    static uint64_t estimate_cycles(const GemmArgs &args, const PerformanceParameters &params) {
+        const auto kernel_width = strategy::out_width();
+        const auto padded_n     = roundup(args._Nsize, kernel_width);
+        const auto padded_k     = roundup(args._Ksize, strategy::k_unroll());
+
+        // Note: current hybrid kernels don't actually round up the height (they have a path for each possible
+        // height), so M is used as-is.  Might need to make this configurable in future.
+        const uint64_t total_macs = static_cast<uint64_t>(args._nbatches) * args._nmulti * args._Msize * padded_n * padded_k;
+
+        float mac_cycles = static_cast<float>(total_macs) / params.kernel_macs_cycle;
+
+        // TODO: A bit of a kludge here: current hybrid kernels incur extra overhead where the width is not a
+        // multiple of the kernel width.  It's most noticeable where the overall width is quite low, so add a
+        // 15% penalty for such widths.
+        const bool below_one_width     = args._Nsize < kernel_width;
+        const bool between_one_and_two = (args._Nsize > kernel_width) && (args._Nsize < 2*kernel_width);
+
+        if (below_one_width || between_one_and_two) {
+            mac_cycles *= 1.15f;
+        }
+
+        return static_cast<uint64_t>(mac_cycles);
+    }
+
+    /* Record the quantized bias pointer and its per-multi stride in the output stage. */
+    void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) override {
+        // Only the requantizing output stage carries a bias; for anything else this is a no-op.
+        if (!std::is_same<OutputStage, Requantize32>::value) {
+            return;
+        }
+
+        Requantize32 &qp = *reinterpret_cast<Requantize32 *>(&_os);
+
+        qp.bias              = bias;
+        qp.bias_multi_stride = bias_multi_stride;
+    }
+ // Install the indirect input pointer table; a non-null _indirect_buf is what makes execute() take the indirect path.  string_len must equal the K size this object was built with.
+ void set_indirect_parameters(size_t string_len, const To * const * const *ptr) override {
+ assert(string_len == _args._Ksize); // Checked in assert-enabled builds only; string_len is not otherwise used.
+ _indirect_buf = ptr;
+ }
+
+    /* Switch this GEMM into convolution mode by building a convolver from the given parameters.  The input
+     * channel count has to match the K size this object was constructed with. */
+    void set_convolution_parameters(ConvolutionParameters parms) override {
+        assert(parms.input_channels == _args._Ksize);
+        _convolver.reset(new convolver<To>(parms));
+    }
+};
+
+} // namespace arm_gemm
+
+#ifdef __I_DEFINED_UNUSED
+#undef UNUSED
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
index 915227fc29..7a5fa87ee6 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
@@ -118,18 +118,27 @@ class GemmHybridQuantized : public GemmCommon<To, Tr> {
// n_block: Work out how many rows (of length k_block) will fit in the L2
// Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
- unsigned int n_block = (((L2_size * 9) / 10) - (k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
- (sizeof(Toi) * k_block);
+ const unsigned int scaled_l2_size = (L2_size * 9) / 10;
+ const unsigned int k_block_area = k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height());
+
+ // .. if the L1 contents is bigger than the L2, just return a minimal size block.
+ if (k_block_area > scaled_l2_size) {
+ return strategy::out_width();
+ }
+
+ unsigned int n_block = (scaled_l2_size - k_block_area) / (sizeof(Toi) * k_block);
// Needs to be (at least a single) multiple of the kernel output width.
n_block /= strategy::out_width();
- n_block = std::max(n_block, 1U) * strategy::out_width();
+ n_block = std::max(n_block, 1u) * strategy::out_width();
// And tune to the presented problem size.
unsigned int numblocks = iceildiv(args._Nsize, n_block);
n_block = iceildiv(args._Nsize, numblocks);
n_block = roundup(n_block, strategy::out_width());
+ assert(n_block > 0);
+
return n_block;
}
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized_inline.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized_inline.hpp
new file mode 100644
index 0000000000..7376b5ffe3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized_inline.hpp
@@ -0,0 +1,265 @@
+/*
+ * Copyright (c) 2017-2019 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <assert.h>
+
+#include <algorithm>
+
+#include "arm_gemm.hpp"
+#include "ndrange.hpp"
+#include "utils.hpp"
+
+#include "mergeresults.hpp"
+#include "transform.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+namespace arm_gemm {
+
+// Implementation of the GemmCommon abstract class.
+template<typename strategy, typename To, typename Tr>
+class GemmHybridQuantizedInline : public GemmCommon<To, Tr> {
+ typedef typename strategy::operand_type Toi;
+ typedef typename strategy::result_type Tri;
+
+ /* const properties set by constructor */
+ const CPUInfo * const _ci;
+
+ const unsigned int _Msize;
+ const unsigned int _Nsize;
+ const unsigned int _Ksize;
+
+ const unsigned int _nbatches;
+ const unsigned int _nmulti;
+
+ /* Blocking info */
+ const unsigned int _k_block;
+ const unsigned int _n_block;
+ const unsigned int _Mround;
+
+ /* Pretransposed buffer. */
+ const Toi *_B_transposed=nullptr;
+
+ const NDRange<4> _window_range;
+
+ Requantize32 _qp;
+ int32_t *col_bias = nullptr;
+
+ void *working_space = nullptr;
+
+ unsigned int _nthreads;
+
+    /* Bytes reserved for column sums: one int32_t per column per multi. */
+    unsigned int get_col_sum_size() const {
+        const unsigned int num_sums = _Nsize * _nmulti;
+
+        return num_sums * sizeof(int32_t);
+    }
+
+    /* K block size for this problem.
+     *
+     * K blocking is not supported here, as only 32-bit results are stored temporarily, so the block always
+     * covers the whole of K.  Neither the configured inner_block_size nor the cache sizes are consulted. */
+    static unsigned int compute_k_block(const GemmArgs &args) {
+        return args._Ksize;
+    }
+
+    /* N block size for this problem.
+     *
+     * A configured outer_block_size always wins.  Otherwise the block is sized so that n_block rows of length
+     * k_block fit in 90% of the L2 cache after subtracting the L1 working set, rounded to a multiple of the
+     * kernel output width and then evened out over the actual N. */
+    static unsigned int compute_n_block(const GemmArgs &args) {
+        if (args._cfg && args._cfg->outer_block_size) {
+            return args._cfg->outer_block_size;
+        }
+
+        const unsigned int k_block = compute_k_block(args);
+        const unsigned int L2_size = args._ci->get_L2_cache_size();
+
+        // n_block: Work out how many rows (of length k_block) will fit in the L2
+        // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
+        const unsigned int scaled_l2_size = (L2_size * 9) / 10;
+        const unsigned int k_block_area   = k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height());
+
+        // .. if the L1 contents is bigger than the L2, just return a minimal size block.  Without this guard the
+        // unsigned subtraction below would wrap around and yield a nonsensical block size.
+        if (k_block_area > scaled_l2_size) {
+            return strategy::out_width();
+        }
+
+        unsigned int n_block = (scaled_l2_size - k_block_area) / (sizeof(Toi) * k_block);
+
+        // Needs to be (at least a single) multiple of the kernel output width.
+        n_block /= strategy::out_width();
+        n_block = std::max(n_block, 1U) * strategy::out_width();
+
+        // And tune to the presented problem size.
+        unsigned int numblocks = iceildiv(args._Nsize, n_block);
+        n_block = iceildiv(args._Nsize, numblocks);
+        n_block = roundup(n_block, strategy::out_width());
+
+        assert(n_block > 0);
+
+        return n_block;
+    }
+
+public:
+ GemmHybridQuantizedInline(GemmHybridQuantizedInline &) = delete;
+ GemmHybridQuantizedInline & operator= (GemmHybridQuantizedInline &) = delete;
+
+ /* Constructor: copies the problem dimensions out of args, derives the K and N block sizes, builds the 4D work window (row blocks of out_height, batches, N blocks, multis) and keeps a copy of the requantization parameters. */
+ GemmHybridQuantizedInline(const GemmArgs &args, const Requantize32 &qp)
+ : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
+ _nbatches(args._nbatches), _nmulti(args._nmulti),
+ _k_block(compute_k_block(args)), _n_block(compute_n_block(args)),
+ _Mround(roundup(args._Msize, strategy::out_height())),
+ _window_range(iceildiv(args._Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmulti),
+ _qp (qp), _nthreads(args._maxthreads) { }
+
+ // Interface implementation - Compulsory functions.  The window size reported is the total number of entries in the 4D window range built by the constructor.
+ ndrange_t get_window_size() const override {
+ return { _window_range.total_size() };
+ }
+
+ // This kernel can always be dynamically scheduled (execute() handles all of K for each work item, so per its own comment no synchronization on the output is needed).
+ bool supports_dynamic_scheduling() const override {
+ return true;
+ }
+
+ // Execute the work items selected by work_range: for each K block, walk the window (row blocks, batch, N block, multi) and call the strategy's kernel with the matching A, B panel, C, column bias and requantization parameters.
+ void execute(const ndcoord_t &work_range, const ndcoord_t &, int) override {
+#ifdef CYCLE_PROFILING
+ profiler prof;
+#endif
+ strategy strat(_ci);
+
+ /* Make sure we've been set up correctly. */
+ assert(_B_transposed);
+ static_assert(std::is_same<To, Toi>::value, "gemm_native: Operand types must be the same.");
+
+ /* For now, each work item implies all the K for a given output
+ * pixel (so we don't need to synchronize access to the output
+ * array). So separate the loop over K blocks here. */
+ for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) {
+ unsigned int kmax = std::min(k0 + _k_block, _Ksize);
+ unsigned int kern_k = roundup(kmax-k0, strategy::k_unroll());
+
+ auto p = _window_range.iterator(work_range.get_position(0), work_range.get_position_end(0));
+
+ if (p.done()) {
+ return;
+ }
+
+ do {
+ // All the rows of this window position are handled by a single kernel call (up to dim0_max), clamped to M.
+ const unsigned int m_start = p.dim(0) * strategy::out_height();
+ const unsigned int m_end = std::min(p.dim0_max() * strategy::out_height(), _Msize);
+ const unsigned int batch = p.dim(1);
+ const unsigned int n0 = p.dim(2) * _n_block;
+ const unsigned int nmax = std::min(n0 + _n_block, _Nsize);
+ const unsigned int multi = p.dim(3);
+
+ // Offset into the pretransposed B: skip whole multis, then the K blocks before k0, then the columns before n0 within this K block.
+ const Toi *b_panel = _B_transposed +
+ (multi * roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll())) +
+ (k0 * roundup(_Nsize, strategy::out_width())) +
+ (n0 * kern_k);
+
+ {
+#ifdef CYCLE_PROFILING
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, (m_end - m_start) * kern_k * roundup(nmax-n0, strategy::out_width()));
+#endif
+ strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + (batch * this->_A_batch_stride) + (m_start * this->_lda) + k0, this->_lda,
+ b_panel,
+ this->_Cptr + (multi * this->_C_multi_stride) + (batch * this->_C_batch_stride) + (m_start * this->_ldc) + n0, this->_ldc,
+ (m_end - m_start), (nmax - n0), kmax - k0,
+ col_bias + (multi * _Nsize) + n0, _qp);
+ }
+ } while (p.next_dim1());
+ }
+ }
+
+ // Interface implementation - pretransposed.  This implementation always works from a pretransposed copy of B.
+ bool B_is_pretransposed() const override {
+ return true;
+ }
+ // Pretransposition is still outstanding until _B_transposed has been set (by pretranspose_B_array() or set_pretransposed_B_data()).
+ bool B_pretranspose_required() const override {
+ return (_B_transposed==nullptr);
+ }
+
+    /* Total bytes the caller must provide: the column sums followed by the pretransposed copy of B (N padded
+     * to the kernel width, K padded to the K unroll, per multi). */
+    size_t get_B_pretransposed_array_size() const override {
+        const size_t transposed_bytes = roundup(_Nsize, strategy::out_width()) * roundup(_Ksize, strategy::k_unroll()) * _nmulti * sizeof(Toi);
+
+        return get_col_sum_size() + transposed_bytes;
+    }
+ // Fill in_buffer with everything derived from B: the per-multi column sums first, then B rearranged by the strategy's PrepareB transform in (multi, K block, N block) order.
+ void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
+ col_bias = reinterpret_cast<int32_t *>(in_buffer);
+
+ for (unsigned int i=0; i<_nmulti; i++) {
+ compute_col_sums(_qp, _Nsize, _Ksize, B + (i * B_multi_stride), ldb, col_bias + (i * _Nsize), _Ksize, i, 0);
+ }
+
+ // The transposed data is placed directly after the column sums.
+ uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
+ Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
+ _B_transposed = buffer;
+ strategy strat(_ci);
+
+ for (unsigned int multi=0; multi<_nmulti; multi++) {
+ for (unsigned int k0=0; k0<_Ksize; k0+=_k_block) {
+ const unsigned int kmax = std::min(k0 + _k_block, _Ksize);
+ const unsigned int k_size = roundup(kmax-k0, strategy::k_unroll());
+
+ for (unsigned int x0=0; x0<_Nsize; x0+=_n_block) {
+ const unsigned int xmax = std::min(x0+_n_block, _Nsize);
+
+ // Elements this block occupies in the output: its width padded to the kernel width, times the padded K size.
+ const unsigned int size = roundup(xmax-x0, strategy::out_width()) * k_size;
+
+ strat.transforms.PrepareB( buffer, B + (multi * B_multi_stride), ldb,
+ x0, xmax, k0, kmax);
+
+ buffer += size;
+ }
+ }
+ }
+ }
+
+    /* Adopt a buffer that was filled by an earlier pretranspose_B_array() call. */
+    void set_pretransposed_B_data(void *in_buffer) override {
+        // The column sums sit at the front of the buffer...
+        col_bias = reinterpret_cast<int32_t *>(in_buffer);
+
+        // ...and the transposed data follows them.
+        const uintptr_t transposed_addr = reinterpret_cast<uintptr_t>(in_buffer) + get_col_sum_size();
+        _B_transposed = reinterpret_cast<Toi *>(transposed_addr);
+    }
+
+    /* Record the quantized bias pointer and its per-multi stride in the requantization parameters. */
+    void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) override {
+        _qp.bias_multi_stride = bias_multi_stride;
+        _qp.bias              = bias;
+    }
+};
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
index 261e7d2d9c..f6a0fc5d52 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_implementation.hpp
@@ -37,9 +37,9 @@ template<typename Top, typename Tret, class OutputStage = Nothing>
struct GemmImplementation {
const GemmMethod method;
const char * name;
- std::function<bool(const GemmArgs &, const OutputStage &)> is_supported;
- std::function<uint64_t(const GemmArgs &, const OutputStage &)> cycle_estimate;
- std::function<GemmCommon<Top, Tret> *(const GemmArgs &, const OutputStage &)> instantiate;
+ std::function<bool(const GemmArgs &, const OutputStage &)> is_supported = {};
+ std::function<uint64_t(const GemmArgs &, const OutputStage &)> cycle_estimate = {};
+ std::function<GemmCommon<Top, Tret> *(const GemmArgs &, const OutputStage &)> instantiate = {};
bool do_is_supported(const GemmArgs &args, const OutputStage &os) const {
if (is_supported != nullptr) {
@@ -57,13 +57,13 @@ struct GemmImplementation {
}
}
- GemmImplementation(const GemmImplementation &) = default;
- GemmImplementation &operator= (const GemmImplementation &) = default;
-
GemmCommon<Top, Tret> *do_instantiate(const GemmArgs &args, const OutputStage &os) const {
return instantiate(args, os);
}
+ GemmImplementation(const GemmImplementation &) = default;
+ GemmImplementation & operator= (const GemmImplementation &) = default;
+
GemmImplementation(GemmMethod m, const char *n,
std::function<bool(const GemmArgs &, const OutputStage &)> is_supported, std::function<bool(const GemmArgs &, const OutputStage &)> is_recommended,
std::function<GemmCommon<Top, Tret> *(const GemmArgs &, const OutputStage &)> instantiate) :
@@ -79,9 +79,9 @@ template<typename Top, typename Tret>
struct GemmImplementation<Top, Tret, Nothing> {
const GemmMethod method;
const char * name;
- std::function<bool(const GemmArgs &)> is_supported;
- std::function<uint64_t(const GemmArgs &)> cycle_estimate;
- std::function<GemmCommon<Top, Tret> *(const GemmArgs &)> instantiate;
+ std::function<bool(const GemmArgs &)> is_supported = {};
+ std::function<uint64_t(const GemmArgs &)> cycle_estimate = {};
+ std::function<GemmCommon<Top, Tret> *(const GemmArgs &)> instantiate = {};
bool do_is_supported(const GemmArgs &args, const Nothing &) const {
if (is_supported != nullptr) {
@@ -103,7 +103,6 @@ struct GemmImplementation<Top, Tret, Nothing> {
return instantiate(args);
}
-
static GemmImplementation with_estimate(GemmMethod m, const char *n,
std::function<bool(const GemmArgs &)> is_supported, std::function<uint64_t(const GemmArgs &)> cycle_estimate,
std::function<GemmCommon<Top, Tret> *(const GemmArgs &)> instantiate) {
@@ -116,7 +115,10 @@ struct GemmImplementation<Top, Tret, Nothing> {
return impl;
}
- GemmImplementation(GemmMethod m, const char * n) : method(m), name(n), is_supported(nullptr), cycle_estimate(nullptr), instantiate(nullptr) {}
+ GemmImplementation(const GemmImplementation &) = default;
+ GemmImplementation & operator= (const GemmImplementation &) = default;
+
+ GemmImplementation(GemmMethod m, const char * n) : method(m), name(n) {}
GemmImplementation(GemmMethod m, const char *n,
std::function<bool(const GemmArgs &)> is_supported, std::function<bool(const GemmArgs &)> is_recommended,
@@ -124,9 +126,6 @@ struct GemmImplementation<Top, Tret, Nothing> {
method(m), name(n), is_supported(is_supported),
cycle_estimate( [is_recommended](const GemmArgs &args) -> uint64_t { return (is_recommended == nullptr) ? 0 : (is_recommended(args) ? 0 : UINT64_MAX); } ),
instantiate(instantiate) { }
-
- GemmImplementation(const GemmImplementation &) = default;
- GemmImplementation &operator=(const GemmImplementation &) = default;
};
/* "Master" function implemented for each valid combination of types.
@@ -211,6 +210,7 @@ std::vector<KernelDescription> get_compatible_kernels(const GemmArgs &args, cons
for (const GemmImplementation<Top, Tret, OutputStage> *i = gemms; i->method != GemmMethod::DEFAULT; i++) {
/* Check that this implementation supports the presented problem. */
+
if (!i->do_is_supported(args, os)) {
continue;
}
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
index da682330a0..a3a61959c3 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int16.cpp
@@ -28,17 +28,17 @@
#include "gemm_implementation.hpp"
#include "gemm_interleaved.hpp"
-#include "kernels/a64_gemm_s16_12x8.hpp"
+#include "kernels/a64_gemm_s16_8x12.hpp"
namespace arm_gemm {
static const GemmImplementation<int16_t, int32_t> gemm_s16_methods[] = {
{
GemmMethod::GEMM_INTERLEAVED,
- "gemm_s16_12x8",
+ "a64_gemm_s16_8x12",
nullptr,
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<gemm_s16_12x8, int16_t, int32_t>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s16_8x12, int16_t, int32_t>(args); }
},
{
GemmMethod::DEFAULT,
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
index 147caeefbd..31f225002e 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_int8.cpp
@@ -26,21 +26,22 @@
#include "arm_gemm.hpp"
#include "gemm_common.hpp"
#include "gemm_hybrid.hpp"
+#include "gemm_hybrid_indirect.hpp"
#include "gemm_implementation.hpp"
#include "gemm_interleaved.hpp"
-#include "gemm_interleaved_pretransposed_2d.hpp"
-#include "kernels/a64_gemm_s16_12x8.hpp"
-#include "kernels/a64_gemm_s8_12x8.hpp"
+#include "kernels/a64_gemm_s16_8x12.hpp"
+#include "kernels/a64_gemm_s8_8x12.hpp"
#include "kernels/a64_gemm_s8_4x4.hpp"
-#include "kernels/a64_hybrid_s8s32_dot_16x4.hpp"
-#include "kernels/a64_interleaved_s8s32_mmla_12x8.hpp"
-#include "kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp"
-#include "kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp"
-#include "kernels/sve_hybrid_s8s32_dot_4VLx4.hpp"
-#include "kernels/sve_interleaved_s8s32_dot_3VLx8.hpp"
-#include "kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp"
-#include "kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp"
+#include "kernels/a64_hybrid_s8s32_dot_6x16.hpp"
+#include "kernels/a64_interleaved_s8s32_mmla_8x12.hpp"
+#include "kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp"
+#include "kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp"
+
+#include "kernels/sve_hybrid_s8s32_dot_6x4VL.hpp"
+#include "kernels/sve_interleaved_s8s32_dot_8x3VL.hpp"
+#include "kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp"
+#include "kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp"
namespace arm_gemm {
@@ -49,106 +50,84 @@ static const GemmImplementation<int8_t, int32_t> gemm_s8_methods[] = {
#ifdef MMLA_INT8
{
GemmMethod::GEMM_INTERLEAVED,
- "interleaved_s8s32_mmla_3VLx8",
+ "sve_interleaved_s8s32_mmla_8x3VL",
[](const GemmArgs &args) { return (args._Ksize>8); },
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<interleaved_s8s32_mmla_3VLx8, int8_t, int32_t>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int32_t>(args); }
},
#endif
{
GemmMethod::GEMM_HYBRID,
- "smallK_hybrid_s8s32_dot_1VLx8",
- [](const GemmArgs &args) { return args._Ksize<=64; },
+ "sve_smallK_hybrid_s8s32_dot_8x1VL",
+ [](const GemmArgs &args) { return args._Ksize<=64 && !args._indirect_input; },
nullptr,
- [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_s8s32_dot_1VLx8, int8_t, int32_t>(args); }
+ [](const GemmArgs &args) { return new GemmHybrid<cls_sve_smallK_hybrid_s8s32_dot_8x1VL, int8_t, int32_t>(args); }
},
{
GemmMethod::GEMM_HYBRID,
- "hybrid_s8s32_dot_4VLx4",
+ "sve_hybrid_s8s32_dot_6x4VL",
[](const GemmArgs &args) { return args._Ksize>=16; },
[](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
- [](const GemmArgs &args) { return new GemmHybrid<hybrid_s8s32_dot_4VLx4, int8_t, int32_t>(args); }
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_s8s32_dot_6x4VL, int8_t, int32_t>(args); }
},
{
GemmMethod::GEMM_INTERLEAVED,
- "interleaved_s8s32_dot_3VLx8",
+ "sve_interleaved_s8s32_dot_8x3VL",
[](const GemmArgs &args) { return (args._Ksize>4); },
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<interleaved_s8s32_dot_3VLx8, int8_t, int32_t>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int32_t>(args); }
},
-#endif
+#endif // SVE
#ifdef MMLA_INT8
{
GemmMethod::GEMM_INTERLEAVED,
- "interleaved_s8s32_mmla_12x8",
+ "a64_interleaved_s8s32_mmla_8x12",
[](const GemmArgs &args) { return (args._Ksize>8); },
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<interleaved_s8s32_mmla_12x8, int8_t, int32_t>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, int32_t>(args); }
},
#endif
{
GemmMethod::GEMM_HYBRID,
- "smallK_hybrid_s8s32_dot_4x8",
- [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32); },
+ "a64_smallK_hybrid_s8s32_dot_8x4",
+ [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._indirect_input; },
nullptr,
- [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_s8s32_dot_4x8, int8_t, int32_t>(args); }
+ [](const GemmArgs &args) { return new GemmHybrid<cls_a64_smallK_hybrid_s8s32_dot_8x4, int8_t, int32_t>(args); }
},
{
GemmMethod::GEMM_HYBRID,
- "smallK_hybrid_s8s32_dot_4x6",
- [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64); },
+ "a64_smallK_hybrid_s8s32_dot_6x4",
+ [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._indirect_input; },
nullptr,
- [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_s8s32_dot_4x6, int8_t, int32_t>(args); }
-},
-{
- GemmMethod::GEMM_HYBRID,
- "hybrid_s8s32_dot_16x4",
- [](const GemmArgs &args) { return args._ci->has_dotprod() && args._Ksize>=16; },
- [](const GemmArgs &args) { return args._Nsize<=256 && args._Ksize>128; },
- [](const GemmArgs &args) { return new GemmHybrid<hybrid_s8s32_dot_16x4, int8_t, int32_t>(args); }
-},
-{
- GemmMethod::GEMM_INTERLEAVED_2D,
- "gemm_s8_12x8_2d",
- [](const GemmArgs &args) { return args._ci->has_dotprod(); },
- [](const GemmArgs &args) { return (args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8); },
- [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<gemm_s8_12x8, int8_t, int32_t>(args); }
+ [](const GemmArgs &args) { return new GemmHybrid<cls_a64_smallK_hybrid_s8s32_dot_6x4, int8_t, int32_t>(args); }
},
{
GemmMethod::GEMM_INTERLEAVED,
- "gemm_s8_12x8_1d",
- [](const GemmArgs &args) { return args._ci->has_dotprod(); },
+ "a64_gemm_s16_8x12",
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<gemm_s8_12x8, int8_t, int32_t>(args); }
+ [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53 && args._Ksize>4; },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s16_8x12, int8_t, int32_t>(args); },
},
{
- GemmMethod::GEMM_INTERLEAVED_2D,
- "gemm_s16_12x8_2d",
- nullptr,
- [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53 && args._Msize > 4 && (args._Msize / args._maxthreads) < 8; },
- [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<gemm_s16_12x8, int8_t, int32_t>(args); },
+ GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_s8s32_dot_6x16",
+ [](const GemmArgs &args) { return args._ci->has_dotprod(); },
+ [](const GemmArgs &args) { return args._Nsize<=256 && args._Ksize>128; },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_s8s32_dot_6x16, int8_t, int32_t>(args); }
},
{
GemmMethod::GEMM_INTERLEAVED,
- "gemm_s16_12x8_1d",
- nullptr,
- [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53 && args._Msize > 4; },
- [](const GemmArgs &args) { return new GemmInterleaved<gemm_s16_12x8, int8_t, int32_t>(args); },
-},
-{
- GemmMethod::GEMM_INTERLEAVED_2D,
- "gemm_s8_4x4_2d",
+ "a64_gemm_s8_8x12",
+ [](const GemmArgs &args) { return args._ci->has_dotprod(); },
nullptr,
- [](const GemmArgs &args) { return ((args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8)) ||
- ((args._Msize / args._maxthreads) < 4); },
- [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<gemm_s8_4x4, int8_t, int32_t>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s8_8x12, int8_t, int32_t>(args); }
},
{
GemmMethod::GEMM_INTERLEAVED,
- "gemm_s8_4x4_1d",
+ "a64_gemm_s8_4x4",
nullptr,
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<gemm_s8_4x4, int8_t, int32_t>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_s8_4x4, int8_t, int32_t>(args); }
},
{
GemmMethod::DEFAULT,
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index c4dceef922..92c1086a5f 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -27,11 +27,12 @@
#include <cassert>
#include "arm_gemm.hpp"
-#include "utils.hpp"
-
+#include "convolver.hpp"
#include "mergeresults.hpp"
#include "performance_parameters.hpp"
+#include "quantized.hpp"
#include "transform.hpp"
+#include "utils.hpp"
#ifdef CYCLE_PROFILING
#include "profiler.hpp"
@@ -46,12 +47,212 @@
//
// This implementation interleaves the source matrices in blocks - good for
// larger matrices.
+
namespace arm_gemm {
-template<typename strategy, typename To, typename Tr>
+namespace {
+
+// Some kernels output to a linear buffer and require a separate merge step.
+// Others output directly to the matrix result. This helper class calls the
+// appropriate functions, using templating to avoid calling non-existent
+// functions.
+// Primary template: it only declares run(). The bodies are provided by the
+// explicit specializations for the <MergeStep, OutputStage> combinations
+// that follow in this file.
+template<bool MergeStep, typename OutputStage>
+class kernel_and_merge {
+public:
+ // Common signature for every specialization. Arguments a given
+ // specialization does not need are left unnamed in its definition.
+ // The profiler argument only exists when CYCLE_PROFILING is defined.
+ template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
+ static void run (
+#ifdef CYCLE_PROFILING
+ profiler &prof,
+#endif
+ strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel,
+ Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
+ unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr,
+ const Activation &act, bool accumulate, const OutputStage &os, const int32_t *col_bias,
+ Tab *acc_buff);
+};
+
+// Run a kernel and call the separate merge step
+// (MergeStep == true, no output stage): the kernel writes into the c_panel
+// scratch buffer, then the strategy's Merge transform copies the
+// [m_0, m_max) x [n_0, n_max) region into c_ptr, applying bias, activation
+// and accumulation. The output stage, column bias and accumulation buffer
+// arguments are unused.
+template<>
+template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
+void kernel_and_merge<true, Nothing>::run(
+#ifdef CYCLE_PROFILING
+ profiler &prof,
+#endif
+ strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel,
+ Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
+ unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *biasptr,
+ const Activation &act, bool accumulate, const Nothing &, const int32_t *, Tab *)
+{
+ // Number of out_width-wide column blocks needed to cover [n_0, n_max).
+ const int bblocks = iceildiv(n_max - n_0, strategy::out_width());
+
+ {
+#ifdef CYCLE_PROFILING
+ auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
+#endif
+
+ // A single block of rows against bblocks column blocks.
+ strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
+ }
+
+ {
+#ifdef CYCLE_PROFILING
+ auto p=prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr)));
+#endif
+ strat.transforms.Merge(c_ptr, c_panel, ldc, m_0, m_max, n_0, n_max, biasptr, act, accumulate);
+ }
+}
+
+// Run a kernel with integrated merge
+// (MergeStep == false, no output stage): the kernel itself writes the
+// [m_0, m_max) x [n_0, n_max) region of the output, or works on acc_buff
+// when c_ptr is null. The c_panel scratch buffer, output stage and column
+// bias arguments are unused.
+template<>
+template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
+void kernel_and_merge<false, Nothing>::run(
+#ifdef CYCLE_PROFILING
+ profiler &prof,
+#endif
+ strategy &strat, const To *a_ptr, const To *b_panel, Tri *,
+ Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max,
+ unsigned int n_0, unsigned int n_max, const Tr *biasptr,
+ const Activation &act, bool accumulate, const Nothing &, const int32_t *,
+ Tab *acc_buff)
+{
+#ifdef CYCLE_PROFILING
+ auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k);
+#endif
+
+ // We need to offset the C pointer, but as it might be NULL (requesting output to accumulation buffer) we need
+ // to be careful not to offset a null pointer.
+ // NOTE(review): offset_c_ptr is declared as Tri* but assigned from a Tr*
+ // expression, so this only compiles when the two types are compatible - confirm.
+ Tri *offset_c_ptr;
+
+ if (c_ptr == nullptr) {
+ offset_c_ptr = nullptr;
+ } else {
+ offset_c_ptr = c_ptr + m_0 * ldc + n_0;
+ }
+
+ strat.kernel(// A and B pointers are just the packed panels.
+ a_ptr, b_panel,
+ // Provide relevant part of output array and row stride.
+ offset_c_ptr, ldc,
+ // M, N, K sizes
+ m_max-m_0, n_max - n_0, kern_k,
+ // Bias, activation, accumulation. Need to offset the bias as needed.
+ biasptr ? biasptr + n_0 : nullptr, act, accumulate,
+ // Accumulation buffer.
+ acc_buff );
+}
+
+// Run a kernel with integrated merge, quantizing
+// (MergeStep == false, Requantize32 output stage): the kernel receives the
+// requantization parameters and the column bias directly. The c_panel
+// scratch buffer, bias pointer and activation arguments are unused.
+template<>
+template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
+void kernel_and_merge<false, Requantize32>::run(
+#ifdef CYCLE_PROFILING
+ profiler &prof,
+#endif
+ strategy &strat, const To *a_ptr, const To *b_panel, Tri *,
+ Tr *c_ptr, int ldc, int kern_k, unsigned int m_0, unsigned int m_max,
+ unsigned int n_0, unsigned int n_max, const Tr *,
+ const Activation &, bool accumulate, const Requantize32 &qp, const int32_t *col_bias,
+ Tab *acc_buff)
+{
+#ifdef CYCLE_PROFILING
+ auto p=prof.ScopedProfiler(PROFILE_KERNEL, (m_max - m_0) * (n_max - n_0) * kern_k);
+#endif
+
+ // NOTE(review): c_ptr is offset here without a null check - confirm that
+ // callers never pass nullptr for requantized output.
+ strat.kernel(// A and B pointers are just the packed panels.
+ a_ptr, b_panel,
+ // Provide relevant part of output array and row stride.
+ c_ptr + m_0 * ldc + n_0, ldc,
+ // M, N, K sizes
+ m_max-m_0, n_max - n_0, kern_k,
+ // Bias, activation, accumulation. Need to offset the bias as needed.
+ col_bias + n_0, qp, n_0, accumulate, acc_buff);
+}
+
+// Run a kernel and call the separate quantize step
+// (MergeStep == true, Requantize32 output stage): the kernel writes 32-bit
+// results into the c_panel scratch buffer, which are then requantized into
+// the [m_0, m_max) x [n_0, n_max) region of c_ptr one column block at a time.
+// The bias pointer, activation, accumulate flag and accumulation buffer
+// arguments are unused.
+template<>
+template<typename strategy, typename To, typename Tr, typename Tri, typename Tab>
+void kernel_and_merge<true, Requantize32>::run(
+#ifdef CYCLE_PROFILING
+        profiler &prof,
+#endif
+        strategy &strat, const To *a_ptr, const To *b_panel, Tri *c_panel,
+        Tr *c_ptr, int ldc, int kern_k, unsigned int m_0,
+        unsigned int m_max, unsigned int n_0, unsigned int n_max, const Tr *,
+        const Activation &, bool, const Requantize32 &qp, const int32_t *col_bias,
+        Tab *)
+{
+    // Number of out_width-wide column blocks needed to cover [n_0, n_max).
+    const int bblocks = iceildiv(n_max - n_0, strategy::out_width());
+
+    {
+#ifdef CYCLE_PROFILING
+        auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
+#endif
+
+        strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
+    }
+
+    {
+#ifdef CYCLE_PROFILING
+        auto p=prof.ScopedProfiler(PROFILE_QUANTIZE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr)));
+#endif
+        // The row bias is interleaved with the transposed A data, directly
+        // after the out_height * kern_k operand elements. It does not depend
+        // on the column block, so fetch the pointer once, outside the loop.
+        const int32_t *row_bias = reinterpret_cast<const int32_t *>(a_ptr + strategy::out_height() * kern_k);
+
+        // The interleaved kernel outputs in blocks - each block is a
+        // row-major matrix of size out_width * out_height.  The merge
+        // kernels are designed to deal with this but the requantizer is
+        // not, so we need to requantize one block at a time.
+        for (int i=0; i<bblocks; i++) {
+            unsigned int n_start = n_0 + (strategy::out_width() * i);
+            unsigned int n_end = std::min(n_start + strategy::out_width(), n_max);
+
+            requantize_block_32(qp, (n_end - n_start), (m_max-m_0),
+                                c_panel + (i * strategy::out_width() * strategy::out_height()), strategy::out_width(),
+                                c_ptr + m_0 * ldc + n_start, ldc,
+                                row_bias, col_bias + n_start, n_start);
+        }
+    }
+}
+
+// Integer GEMMs can be used in two contexts - "normal" where the full 32-bit output is required, or in
+// "requantizing" context where the output will be requantized.
+//
+// These require different input transforms, as if we are requantizing we want to sum the rows of the A input, and
+// if we are not we don't.
+//
+// This helper class allows the appropriate transforms to be found, without requiring kernels that don't support
+// quantization to define useless "quantized" transforms.
+// Default (quantized == false): the strategy's ordinary transforms.
+template<typename strategy, bool quantized>
+struct transform_type {
+    using type = decltype(strategy::transforms);
+};
+
+// quantized == true: the strategy's "transforms_quantized" member is used
+// instead, so only strategies instantiated this way need to provide it.
+template<typename strategy>
+struct transform_type<strategy, true> {
+    using type = decltype(strategy::transforms_quantized);
+};
+
+// We need a similar trick here to figure out what type the accumulator buffer should be.
+// Default: accumulate in the strategy's own result type.
+template<typename strategy, typename OutputStage>
+struct accumulate_buffer_type {
+    using type = typename strategy::result_type;
+};
+
+// Requantize32 output stage: accumulate in int32_t regardless of the
+// strategy's result type.
+template<typename strategy>
+struct accumulate_buffer_type<strategy, Requantize32> {
+    using type = int32_t;
+};
+
+} // anonymous namespace
+
+template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing, bool MergeStep=true, bool ForceThreadColumns=false>
class GemmInterleaved : public GemmCommon<To, Tr> {
typedef typename strategy::operand_type Toi;
typedef typename strategy::result_type Tri;
+ typedef typename accumulate_buffer_type<strategy, OutputStage>::type Tab;
/* const properties set by constructor */
const CPUInfo * const _ci;
@@ -59,10 +260,15 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
const unsigned int _Msize;
const unsigned int _Nsize;
const unsigned int _Ksize;
+ const unsigned int _Ksections;
+ const unsigned int _Ktotal;
+ const unsigned int _rounded_Ksize;
const unsigned int _nbatches;
const unsigned int _nmulti;
+ const bool _thread_columns;
+
const Activation _act;
const int _maxthreads;
@@ -77,30 +283,59 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
const Toi *_B_transposed=nullptr;
void *_working_space=nullptr;
+ Tab *_accumulation_buffer=nullptr;
+
+ /* Output stage */
+ OutputStage _os;
+
+ /* Quantized support (in addition to 'output stage' above) */
+ int32_t *col_bias = nullptr;
+
+ /* Indirect parameters. _indirect_buf doubles as a flag to indicate that "indirect" transform should be used. */
+ const To * const * const * _indirect_buf = nullptr;
+
+ /* Convolver - only set up for convolution problems, so also doubles as a flag. */
+ std::unique_ptr<convolver<To>> _convolver = nullptr;
+
+    // Bytes taken by the column sums: one int32_t per column of every multi
+    // when requantizing, nothing otherwise.
+    unsigned int get_col_sum_size() const {
+        if (!std::is_same<OutputStage, Requantize32>::value) {
+            return 0;
+        }
+
+        return _Nsize * _nmulti * sizeof(int32_t);
+    }
+
/* We will need to walk through the blocks of B in a few contexts, so
* factor that out. */
class blockwalker {
private:
/* Size loops, etc. based on our parent's configuration */
- const GemmInterleaved<strategy, To, Tr> &_parent;
+ const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &_parent;
/* K, X and multi parameters for current iteration. */
unsigned int _k0=0, _x0=0, _multi=0;
+ /* Range of X to iterate over - used in "ForceThreadColumns" cases */
+ unsigned int _x_start=0;
+ unsigned int _x_end=_parent._Nsize;
+
unsigned int _index=0;
bool _done=false;
bool _newkblock=true;
bool _newmulti=true;
public:
- blockwalker(const GemmInterleaved<strategy, To, Tr> &parent) : _parent(parent) { }
+ blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &parent) : _parent(parent) { }
+
+        // Walk only the columns in [x_start, x_end).
+        // Members are initialised in declaration order, and _x0 is declared
+        // before _x_start, so _x0 must be seeded from the constructor argument:
+        // reading the _x_start member at that point would use an
+        // uninitialised value.
+        blockwalker(const GemmInterleaved<strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns> &parent,
+                    unsigned int x_start, unsigned int x_end) : _parent(parent), _x0(x_start), _x_start(x_start), _x_end(x_end) { }
unsigned int xmax() {
- return std::min(_x0 + _parent._x_block, _parent._Nsize);
+ return std::min(_x0 + _parent._x_block, _x_end);
}
unsigned int kmax() {
- return std::min(_k0 + _parent._k_block, _parent._Ksize);
+ return std::min(_k0 + _parent._k_block, _parent._Ktotal);
}
/* Advance to the next block, return false at the end. */
@@ -111,10 +346,10 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
_newkblock=false;
_x0 += _parent._x_block;
- if (_x0 >= _parent._Nsize) {
- _x0=0;
+ if (_x0 >= _x_end) {
+ _x0=_x_start;
_k0 += _parent._k_block;
- if (_k0 >= _parent._Ksize) {
+ if (_k0 >= _parent._Ktotal) {
_k0=0;
_multi++;
if (_multi >= _parent._nmulti) {
@@ -138,14 +373,125 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
bool newkblock(void) { return _newkblock; }
};
- // A working size: One of these needed, regardless of thread count. Divided according to window.
+ // "k block" has two distinct uses: figuring out which iterations of K
+ // to actually process, but also various size/pointer computations. The
+ // latter needs to take account of the extra space needed for the row
+ // sums, if appropriate.
+    // _k_block plus, when requantizing, as many operand-sized (Toi) slots as
+    // one int32_t occupies.
+    unsigned int get_total_k_depth() const {
+        const unsigned int extra_slots = std::is_same<OutputStage, Requantize32>::value
+                                             ? static_cast<unsigned int>(sizeof(int32_t) / sizeof(Toi))
+                                             : 0u;
+
+        return _k_block + extra_slots;
+    }
+
+ // A working size.
size_t get_a_working_size() const {
- return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches);
+ if (_thread_columns) {
+ // For 2D threading: allocate a buffer of one block of rows per thread
+ return ROUND_UP(sizeof(Toi) * get_total_k_depth() * strategy::out_height() * _maxthreads);
+ } else {
+ // For 1D threaded: one of these needed, regardless of thread count. Divided according to window.
+ return ROUND_UP(sizeof(Toi) * get_total_k_depth() * _Mround * _nbatches);
+ }
}
- // C working size: One needed per thread.
+ // C working size: One needed per thread. Not needed if there is no merge step.
size_t get_c_working_size() const {
- return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
+ if (MergeStep) {
+ return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
+ } else {
+ return 0;
+ }
+ }
+
+    // Bytes needed for the accumulation buffer; 0 when none is used.
+    size_t get_accumulation_buffer_size() const {
+        // No buffer for kernels with a separate merge step, nor when K is not
+        // actually being blocked (a single K block covers all of _Ktotal).
+        if (MergeStep || _k_block == _Ktotal) {
+            return 0;
+        }
+
+        // One out_height x out_width tile of Tab per output block, for every
+        // batch and multi.
+        const size_t tile_count = iceildiv(_Msize, strategy::out_height()) * iceildiv(_Nsize, strategy::out_width()) * _nbatches * _nmulti;
+        const size_t tile_bytes = sizeof(Tab) * strategy::out_height() * strategy::out_width();
+
+        return tile_count * tile_bytes;
+    }
+
+    // Locate the accumulation tile for the output block whose top-left corner
+    // is (M, N) in the given batch and multi. Returns nullptr when no
+    // accumulation buffer has been set.
+    Tab *get_accumulation_buffer(unsigned int M, unsigned int N, unsigned int batch, unsigned int multi) const {
+        if (_accumulation_buffer == nullptr) {
+            return nullptr;
+        }
+
+        // (M, N) must reference the top-left corner of a block.
+        assert(M % strategy::out_height() == 0);
+        assert(N % strategy::out_width() == 0);
+
+        // Tiles are indexed by column within a row of tiles, then by row,
+        // then by batch, then by multi.
+        const size_t tiles_down      = iceildiv(_Msize, strategy::out_height());
+        const size_t tiles_across    = iceildiv(_Nsize, strategy::out_width());
+        const size_t tiles_per_batch = tiles_down * tiles_across;
+        const size_t tiles_per_multi = tiles_per_batch * _nbatches;
+
+        const size_t tile_row   = M / strategy::out_height();
+        const size_t tile_col   = N / strategy::out_width();
+        const size_t tile_index = multi * tiles_per_multi + batch * tiles_per_batch + tile_row * tiles_across + tile_col;
+
+        // Counted in Tab elements rather than bytes: the pointer is already typed.
+        const size_t elements_per_tile = strategy::out_height() * strategy::out_width();
+
+        return _accumulation_buffer + (tile_index * elements_per_tile);
+    }
+
+    // Negated b_offset of the requantization parameters when the output
+    // stage is Requantize32; 0 for any other output stage.
+    int32_t row_sum_multiplier() const {
+        if (!std::is_same<OutputStage, Requantize32>::value) {
+            return 0;
+        }
+
+        // The cast is needed because this code is also compiled when
+        // OutputStage is not Requantize32, even though it is never reached then.
+        return -reinterpret_cast<const Requantize32 *>(&_os)->b_offset;
+    }
+
+ // Heuristics to decide whether to use the 'thread columns' regime
+ // Returns true when threading should also be spread over columns, based only
+ // on the problem description in args and the ForceThreadColumns parameter.
+ static bool is_thread_columns(const GemmArgs &args) {
+ // For now, there is a template parameter to force it.
+ if (ForceThreadColumns) {
+ return true;
+ }
+
+ // Never do this for single threaded cases.
+ if (args._maxthreads == 1) {
+ return false;
+ }
+
+ // How many blocks of work are available for threading on M?
+ int m_blocks = iceildiv(args._Msize, strategy::out_height()) * args._nbatches;
+
+ // If we just can't share the work across threads with the row threading regime.
+ if (args._maxthreads > m_blocks) {
+ return true;
+ }
+
+ // If the row threading regime is too wasteful (20% threshold)
+ // i.e. rounding m_blocks up to a multiple of the thread count inflates it
+ // to more than 120% of its real value (integer arithmetic).
+ if (((roundup(m_blocks, args._maxthreads) * 100) / m_blocks) > 120) {
+ return true;
+ }
+
+ return false;
+ }
+
+    // Total K depth: every one of the _Ksections sections contributes _Ksize
+    // rounded up to the kernel's K unroll.
+    static unsigned int get_ktotal(const GemmArgs &args) {
+        const auto padded_section_depth = roundup(args._Ksize, strategy::k_unroll());
+
+        return args._Ksections * padded_section_depth;
+    }
static unsigned int get_k_block_size(const GemmArgs &args) {
@@ -153,6 +499,11 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
return args._cfg->inner_block_size;
}
+ // K blocking not supported if we are requantizing.
+ if (std::is_same<OutputStage, Requantize32>::value) {
+ return get_ktotal(args);
+ }
+
const unsigned int L1_size = args._ci->get_L1_cache_size();
unsigned int k_block;
@@ -165,58 +516,84 @@ class GemmInterleaved : public GemmCommon<To, Tr> {
k_block = std::max(k_block, 1U) * strategy::k_unroll();
// Now tune to presented problem size; this is how many blocks we need.
- unsigned int num_k_blocks = iceildiv(args._Ksize, k_block);
+ unsigned int num_k_blocks = iceildiv(get_ktotal(args), k_block);
// So divide the space equally into that many blocks.
- k_block = iceildiv(args._Ksize, num_k_blocks);
+ k_block = iceildiv(get_ktotal(args), num_k_blocks);
// And round UP to the K unroll level required.
k_block = roundup(k_block, strategy::k_unroll());
+ assert(k_block > 0);
+
return k_block;
}
-public:
- GemmInterleaved(GemmInterleaved &) = delete;
- GemmInterleaved & operator= (GemmInterleaved &) = delete;
-
- /* Constructor */
- GemmInterleaved(const GemmArgs &args)
- : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
- _nbatches(args._nbatches), _nmulti(args._nmulti),
- _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
- _k_block(get_k_block_size(args)) {
- const unsigned int L2_size = _ci->get_L2_cache_size();
-
- assert(_maxthreads > 0);
+ static unsigned int get_x_block_size(const GemmArgs &args) {
+ if (is_thread_columns(args)) {
+ // In 2D mode, override X block, because we will process width first.
+ return roundup(args._Nsize, strategy::out_width());
+ }
- // Work out blocking parameters, or override from provided GemmConfig
- // TODO: Move outer block into a static function too.
if (args._cfg && args._cfg->outer_block_size) {
- _x_block = args._cfg->outer_block_size;
- } else {
- // x_block: Work out how many rows (of length k_block) will fit in the L2
- // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
- _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height()))) /
- (sizeof(Toi) * _k_block);
+ return roundup(args._cfg->outer_block_size, strategy::out_width());
+ }
- // Needs to be (at least a single) multiple of the kernel output width.
- _x_block /= strategy::out_width();
- _x_block = std::max(_x_block, 1U) * strategy::out_width();
+ unsigned int x_block;
+ const unsigned int L2_size = args._ci->get_L2_cache_size();
+ const unsigned int k_block = get_k_block_size(args);
- // And tune to the presented problem size.
- unsigned int num_x_blocks = iceildiv(_Nsize, _x_block);
- _x_block = iceildiv(_Nsize, num_x_blocks);
+ // x_block: Work out how many rows (of length k_block) will fit in the L2
+ // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
+ const unsigned int scaled_l2_size = (L2_size * 9) / 10;
+ const unsigned int k_block_area = k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height());
- _x_block = iceildiv(_x_block, strategy::out_width());
- _x_block *= strategy::out_width();
+ // .. if the L1 contents is bigger than the L2, just return a minimal size block.
+ if (k_block_area > scaled_l2_size) {
+ return strategy::out_width();
}
- // Work out the rounded size of M - needed for some buffers.
- _Mround = iceildiv(_Msize, strategy::out_height());
- _Mround *= strategy::out_height();
+ x_block = (scaled_l2_size - k_block_area) / (sizeof(Toi) * k_block);
+
+ // Needs to be (at least a single) multiple of the kernel output width.
+ x_block /= strategy::out_width();
+ x_block = std::max(x_block, 1u) * strategy::out_width();
+
+ // And tune to the presented problem size.
+ unsigned int num_x_blocks = iceildiv(args._Nsize, x_block);
+ x_block = iceildiv(args._Nsize, num_x_blocks);
+
+ x_block = roundup(x_block, strategy::out_width());
+
+ assert(x_block > 0);
+
+ return x_block;
}
+public:
+ GemmInterleaved(GemmInterleaved &) = delete;
+ GemmInterleaved & operator= (GemmInterleaved &) = delete;
+
+ /* Constructor */
+ // Takes the problem description (args) and a copy of the output stage (os).
+ // All derived sizes (_Ktotal, _thread_columns, _k_block, _x_block) come from
+ // static helpers that read only args, so they do not depend on the order in
+ // which the other members are initialised. _rounded_Ksize reads the _Ksize
+ // member, which is declared before it.
+ GemmInterleaved(const GemmArgs &args, const OutputStage &os)
+ : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
+ _Ksections(args._Ksections), _Ktotal(get_ktotal(args)),
+ _rounded_Ksize(roundup(_Ksize, strategy::k_unroll())),
+ _nbatches(args._nbatches), _nmulti(args._nmulti), _thread_columns(is_thread_columns(args)),
+ _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
+ _k_block(get_k_block_size(args)), _x_block(get_x_block_size(args)), _Mround(roundup(args._Msize, strategy::out_height())),
+ _os(os) { }
+
+ /* Constructor without OutputStage */
+ // Same initialisation as the two-argument constructor, except that the
+ // output stage member is value-initialised instead of copied from a caller.
+ GemmInterleaved(const GemmArgs &args)
+ : _ci(args._ci), _Msize(args._Msize), _Nsize(args._Nsize), _Ksize(args._Ksize),
+ _Ksections(args._Ksections), _Ktotal(get_ktotal(args)),
+ _rounded_Ksize(roundup(_Ksize, strategy::k_unroll())),
+ _nbatches(args._nbatches), _nmulti(args._nmulti), _thread_columns(is_thread_columns(args)),
+ _act(args._act), _maxthreads(args._maxthreads), _nthreads(args._maxthreads),
+ _k_block(get_k_block_size(args)), _x_block(get_x_block_size(args)), _Mround(roundup(args._Msize, strategy::out_height())),
+ _os() { }
+
// Interface implementation - Compulsory functions
// Window size: Only the last thread should do a ragged block, so dole
@@ -224,8 +601,14 @@ public:
// not multi for now (as this would cause problems with the buffer
// manager).
ndrange_t get_window_size() const override {
- // _Mround is a multiple of out_height by definition.
- return { (_Mround / strategy::out_height()) * _nbatches };
+ unsigned int row_blocks = (_Mround / strategy::out_height()) * _nbatches;
+
+ if (_thread_columns) {
+ return { row_blocks, iceildiv(_Nsize, strategy::out_width()) };
+ } else {
+ // _Mround is a multiple of out_height by definition.
+ return { row_blocks };
+ }
}
// set_nthreads: pass on to buffer manager to avoid it waiting for non-existent threads.
@@ -235,117 +618,262 @@ public:
// Execute
void execute(const ndcoord_t &work_range, const ndcoord_t &, int threadid) override {
- const auto start = work_range.get_position(0);
- const auto end = work_range.get_position_end(0);
#ifdef CYCLE_PROFILING
profiler prof;
#endif
+
+ /* Make sure we've been set up correctly. */
+ assert(_B_transposed);
+ assert(_working_space);
+ int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);
+
+ /* Align if needed */
+ intptr_t working_space_v = reinterpret_cast<intptr_t>(_working_space);
+ if (working_space_v & 0x3f) {
+ intptr_t alignment_offset = 0x40 - (working_space_v & 0x3f);
+ working_space_bytes += alignment_offset;
+ }
+
strategy strat(_ci);
- blockwalker current(*this);
+ const auto start = work_range.get_position(0);
+ const auto end = work_range.get_position_end(0);
/* Translate 'start' and 'end' into a position within the batches and rows. */
const unsigned int window_per_batch = _Mround / strategy::out_height();
unsigned int batch_0 = start / window_per_batch;
unsigned int batch_end = end / window_per_batch;
- /* Compute the M values to operate on */
- unsigned int m_0 = (start - (batch_0 * window_per_batch)) * strategy::out_height();
- unsigned int m_max = (end - (batch_end * window_per_batch)) * strategy::out_height();
+ // In ThreadColumns mode, process work one horizontal strip at a time.
+ // Transpose the block of needed rows at the start, then do all the work on that block.
+ if (_thread_columns) {
+ const auto start_x = work_range.get_position(1) * strategy::out_width();
+ const auto end_x = std::min(work_range.get_position_end(1) * strategy::out_width(), _Nsize);
- /* Make sure we've been set up correctly. */
- assert(_B_transposed);
- assert(_working_space);
- int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);
+ Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));
+ Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()) +
+ (threadid * sizeof(Toi) * get_total_k_depth() * strategy::out_height()));
- // Private buffers. Treat working_space as an array of C buffers
- // (one per thread) first, followed by the (window-divided) A
- // buffer.
- // Set a_panel to the base of the A buffers - compute offsets into it based on M/batches later.
- Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()));
- Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));
+ for (unsigned int multi=0; multi<_nmulti; multi++) {
+ for (unsigned int k0=0; k0<_Ktotal; k0+=_k_block) {
+ unsigned int kmax=std::min(k0+_k_block, _Ktotal);
- const Toi *b_panel;
- b_panel = _B_transposed;
+ unsigned int rounded_width = roundup(_Nsize, strategy::out_width());
- //printf("Starting GEMM loop, x_block=%d, k_block=%d\n", _x_block, _k_block);
+ const bool first_pass = (k0==0);
+ const bool last_pass = (kmax==_Ktotal);
- // newkblock() is always true on the first iteration, so this will be set properly on the first loop.
- int kern_k = 0;
+ // Figure out how many "K" the kernel will actually process.
+ unsigned int kern_k = roundup(kmax - k0, strategy::k_unroll());
- for (;!current.done();current.advance()) {
- if (current.newkblock()) {
-#ifdef CYCLE_PROFILING
- auto p=prof.ScopedProfiler(PROFILE_PREPA, (end - start) * strategy::out_height() * (current.kmax()-current.k0()) * sizeof(Toi));
-#endif
- for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
- unsigned int first_m = (batch == batch_0) ? m_0 : 0;
- unsigned int last_m = (batch == batch_end) ? m_max : _Msize;
+ const Toi *b_ptr = _B_transposed + (rounded_width * _Ktotal * multi) + (k0 * rounded_width) + (start_x * kern_k);
- if (first_m >= last_m)
- continue;
+ unsigned int batch = batch_0;
+ unsigned int start_row = (start - (batch_0 * window_per_batch)) * strategy::out_height();
- strat.transforms.PrepareA(a_panel + ((batch * _Mround + first_m) * _k_block),
- this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
- this->_lda, first_m, last_m, current.k0(), current.kmax());
- }
+ for (unsigned int p=start; p<end; p++) {
+ unsigned int end_row = std::min(start_row + strategy::out_height(), _Msize);
- // Figure out how many "K" the kernel will actually process.
- kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll());
- kern_k *= strat.k_unroll();
+ // Set up transposed 'A' block
+ {
+#ifdef CYCLE_PROFILING
+ auto p=prof.ScopedProfiler(PROFILE_PREPA, strategy::out_height() * (kmax-k0) * sizeof(Toi));
+#endif
+ // See comment above on transform_type<> class: this extracts either 'transforms' or
+ // 'transforms_quantized' as appropriate.
+ typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>::type transforms;
+
+ if (_indirect_buf != nullptr) {
+ transforms.PrepareA_indirect(a_panel,
+ _indirect_buf + (multi * _nbatches * _Ksections) + (batch * _Ksections), _Ksize,
+ _rounded_Ksize, start_row, end_row, k0, kmax, row_sum_multiplier());
+ } else if (_convolver) {
+ transforms.PrepareA_convolution(a_panel,
+ this->_Aptr + (batch * this->_A_batch_stride) + (multi * this->_A_multi_stride),
+ this->_lda, *_convolver, _rounded_Ksize, start_row, end_row, k0, kmax, row_sum_multiplier());
+ } else {
+ transforms.PrepareA(a_panel,
+ this->_Aptr + (batch * this->_A_batch_stride) + (multi * this->_A_multi_stride),
+ this->_lda, start_row, end_row, k0, std::min(kmax, _Ksize), row_sum_multiplier());
+ }
+ }
+
+ // Perform the kernel and merge step, either separately or together as required.
+ kernel_and_merge<MergeStep, OutputStage>::run(
+ #ifdef CYCLE_PROFILING
+ prof,
+ #endif
+ // Strategy and panel pointers
+ strat, a_panel, b_ptr, c_panel,
+ // Result buffer pointers
+ this->_Cptr + (batch * this->_C_batch_stride) + (multi * this->_C_multi_stride), this->_ldc,
+ // K size, and M/N ranges
+ kern_k, start_row, end_row, start_x, end_x,
+ // Only do bias on the first pass
+ ((first_pass && this->_bias) ? this->_bias + (multi * this->_bias_multi_stride) : nullptr),
+ // Only do activation on the last pass, and accumulation on any non-first pass.
+ (last_pass ? _act : Activation()), !first_pass,
+ // Pass in quantization parameters for requantizing kernels (others will ignore)
+ _os, col_bias + (multi * _Nsize),
+ // Accumulation buffer (not yet implemented on this path)
+ static_cast<Tab *>(nullptr));
+
+ /* Increment to the next block */
+ start_row += strategy::out_height();
+ if (start_row >= _Msize) {
+ start_row = 0;
+ batch++;
+ }
+ }
+ }
}
+ } else {
+ blockwalker current(*this);
+
+ /* Compute the M values to operate on */
+ unsigned int m_0 = (start - (batch_0 * window_per_batch)) * strategy::out_height();
+ unsigned int m_max = (end - (batch_end * window_per_batch)) * strategy::out_height();
+
+ // Private buffers. Treat working_space as an array of C buffers
+ // (one per thread) first, followed by the (window-divided) A
+ // buffer.
+ // Set a_panel to the base of the A buffers - compute offsets into it based on M/batches later.
+ Toi * const a_panel = reinterpret_cast<Toi *>(working_space_bytes + (_maxthreads * get_c_working_size()));
+ Tri * const c_panel = reinterpret_cast<Tri *>(working_space_bytes + (threadid * get_c_working_size()));
- int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width());
+ const Toi *b_panel;
+ b_panel = _B_transposed;
- /* Do the actual work. */
- for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
- unsigned int first_m = (batch == batch_0) ? m_0 : 0;
- unsigned int last_m = (batch == batch_end) ? m_max : _Msize;
+ // newkblock() is always true on the first iteration, so these will be set properly on the first loop.
- const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block;
+ // kern_k tracks the accumulation depth for the CURRENT K block; a_panel_stride similarly tracks the total
+ // stride of the A panel (i.e. with 4 added for cases with embedded row sums).
- if (first_m >= last_m)
- continue;
+ // These are distinct from k_block and get_total_k_depth() which are based on the target K block size, and
+ // used for addressing inside a_panel.
- for (unsigned int y=first_m; y<last_m; y+=strategy::out_height()) {
- unsigned int ymax = std::min(_Msize, y + strategy::out_height());
+ // In cases where K blocking is in use and the blocks are not all the same size, the (smaller) final block
+ // won't use all the memory allocated.
+ unsigned int kern_k = 0;
+ unsigned int a_panel_stride = 0;
- {
+ for (;!current.done();current.advance()) {
+ if (current.newkblock()) {
#ifdef CYCLE_PROFILING
- auto p=prof.ScopedProfiler(PROFILE_KERNEL, (strategy::out_height() * bblocks * strategy::out_width() * kern_k));
+ auto p=prof.ScopedProfiler(PROFILE_PREPA, (end - start) * strategy::out_height() * (current.kmax()-current.k0()) * sizeof(Toi));
#endif
+ // See comment above on transform_type<> class: this extracts either 'transforms' or
+ // 'transforms_quantized' as appropriate.
+ typename transform_type<strategy, MergeStep && std::is_same<OutputStage, Requantize32>::value>::type transforms;
+
+ for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
+ unsigned int first_m = (batch == batch_0) ? m_0 : 0;
+ unsigned int last_m = (batch == batch_end) ? m_max : _Msize;
+
+ if (first_m >= last_m)
+ continue;
+
+ if (_indirect_buf != nullptr) {
+ transforms.PrepareA_indirect(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
+ _indirect_buf + (current.multi() * _nbatches * _Ksections) + (batch * _Ksections), _Ksize,
+ _rounded_Ksize, first_m, last_m, current.k0(), current.kmax(), row_sum_multiplier());
+ } else if (_convolver) {
+ transforms.PrepareA_convolution(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
+ this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
+ this->_lda, *_convolver, _rounded_Ksize, first_m, last_m, current.k0(), current.kmax(), row_sum_multiplier());
+ } else {
+ transforms.PrepareA(a_panel + ((batch * _Mround + first_m) * get_total_k_depth()),
+ this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride),
+ this->_lda, first_m, last_m, current.k0(), std::min(_Ksize, current.kmax()), row_sum_multiplier());
+ }
+ }
+
+ // Figure out how many "K" the kernel will actually process.
+ kern_k = roundup(current.kmax() - current.k0(), strategy::k_unroll());
- strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
+ // Requantizing GEMMs have the row sums built in to the
+ // transposed data, so the stride between rows is 4 bytes
+ // larger than the (rounded) K value.
- a_ptr += (strategy::out_height() * kern_k);
+ if(std::is_same<OutputStage, Requantize32>::value) {
+ a_panel_stride = kern_k + (sizeof(int32_t) / sizeof(Toi));
+ } else {
+ a_panel_stride = kern_k;
}
+ }
- {
-#ifdef CYCLE_PROFILING
- auto p=prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height() * bblocks * strategy::out_width() * sizeof(Tr)));
-#endif
- /* Only activate on last pass, only add bias on first pass, ask for accumulation on any non-first pass */
- const bool first_pass = current.k0()==0;
- const bool last_pass = current.kmax()==_Ksize;
-
- strat.transforms.Merge(this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride),
- c_panel, this->_ldc, y, ymax, current.x0(), current.xmax(),
- ((first_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr),
- (last_pass ? _act : Activation()), !first_pass);
+ /* Do the actual work. */
+ for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
+ unsigned int first_m = (batch == batch_0) ? m_0 : 0;
+ unsigned int last_m = (batch == batch_end) ? m_max : _Msize;
+
+ const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * get_total_k_depth();
+
+ if (first_m >= last_m)
+ continue;
+
+ // For the merge case we need to do this out_height() rows
+ // at a time, as that is the size of our intermediate
+ // buffer. If we are not doing that, we can do all the
+ // relevant rows in one go.
+ unsigned int m_step = MergeStep ? strategy::out_height() : (last_m - first_m);
+
+ // But in the case where we have an accumulation buffer, we can't do that after all, unless
+ // there is no N blocking.
+ if (_accumulation_buffer && ((current.x0() != 0) || (current.xmax() < _Nsize))) {
+ m_step = strategy::out_height();
+ }
+
+ for (unsigned int y=first_m; y<last_m; y+=m_step) {
+ unsigned int ymax = std::min(_Msize, y + m_step);
+
+ const bool first_pass = (current.k0() == 0);
+ const bool last_pass = (current.kmax() == _Ktotal);
+
+ // Pointer to appropriate part of result array.
+ Tr *result_ptr = this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride);
+
+ // If we are using an accumulation buffer, we pass a null result pointer on every pass except the last; this
+ // asks the kernel to write into the accumulation buffer instead of the result array.
+ if (_accumulation_buffer && !last_pass) {
+ result_ptr = nullptr;
+ }
+
+ // Perform the kernel and merge step, either separately or together as required.
+ kernel_and_merge<MergeStep, OutputStage>::run(
+ #ifdef CYCLE_PROFILING
+ prof,
+ #endif
+ // Strategy and panel pointers
+ strat, a_ptr, b_panel, c_panel,
+ // Result buffer pointers
+ result_ptr, this->_ldc,
+ // K size, and M/N ranges
+ kern_k, y, ymax, current.x0(), current.xmax(),
+ // Only do bias on the first pass
+ ((first_pass && this->_bias) ? this->_bias + (current.multi() * this->_bias_multi_stride) : nullptr),
+ // Only do activation on the last pass, and accumulation on any non-first pass.
+ (last_pass ? _act : Activation()), !first_pass,
+ // Pass in quantization parameters for requantizing kernels (others will ignore)
+ _os, col_bias + (current.multi() * _Nsize),
+ // Accumulation buffer
+ get_accumulation_buffer(y, current.x0(), batch, current.multi()) );
+
+ a_ptr += (strategy::out_height() * a_panel_stride);
}
}
- }
- b_panel += (bblocks * strat.out_width() * kern_k);
+ b_panel += (roundup(current.xmax() - current.x0(), strategy::out_width()) * kern_k);
+ }
}
}
// Interface implementation - working space
size_t get_working_size() const override {
- // In all cases, we need one A buffer plus a C buffer per thread.
- size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads);
+ // In all cases, we need one A buffer plus a C buffer per thread, plus an accumulation buffer.
+ size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads) + get_accumulation_buffer_size();
- size += 64; // Add on a cache line extra for alignment.
+ size += 128; // Add on two cache lines extra for alignment.
return size;
}
@@ -362,9 +890,22 @@ public:
}
working_space_bytes += diff;
+ working_space_int += diff;
// Pretransposed case: just set internal pointer to parameter value.
_working_space = reinterpret_cast<void *>(working_space_bytes);
+
+ // Set up accumulation buffer
+ if (get_accumulation_buffer_size() > 0) {
+ intptr_t acc_buff_int = working_space_int + get_a_working_size() + (get_c_working_size() * _maxthreads);
+ // Make sure the accumulation buffer is aligned (needed if the other blocks are not a multiple of cache line length)
+ if (acc_buff_int & 0x3F) {
+ acc_buff_int += (0x40 - (acc_buff_int & 0x3F));
+ }
+ _accumulation_buffer = reinterpret_cast<Tab *>(acc_buff_int);
+ } else {
+ _accumulation_buffer = nullptr;
+ }
}
// Interface implementation - pretransposed
@@ -376,56 +917,105 @@ public:
return (_B_transposed==nullptr);
}
- // TODO: this could almost certainly be considerably simpler.
size_t get_B_pretransposed_array_size() const override {
- size_t total=0;
- blockwalker current(*this);
+ unsigned int x_size = roundup(_Nsize, strategy::out_width());
- do {
- /* Figure out the size of each block. */
- unsigned int x_size = (current.xmax() - current.x0());
- unsigned int k_size = (current.kmax() - current.k0());
+ return (x_size * _Ktotal * _nmulti * sizeof(Toi)) + get_col_sum_size();
+ }
- /* Round sizes up as needed. */
- x_size = iceildiv(x_size, strategy::out_width());
- x_size *= strategy::out_width();
+ void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
+ if (std::is_same<OutputStage, Requantize32>::value) {
+ col_bias = reinterpret_cast<int32_t *>(in_buffer);
- k_size = iceildiv(k_size, strategy::k_unroll());
- k_size *= strategy::k_unroll();
+ Requantize32 *qp_ptr = reinterpret_cast<Requantize32 *>(&_os);
- total += x_size * k_size * sizeof(Toi);
- } while (current.advance());
+ for (unsigned int i=0; i<_nmulti; i++) {
+ // The input is assumed not to have any padding between sections, so straightforward Ksize * Ksections computation gets the total size.
+ compute_col_sums(*qp_ptr, _Nsize, _Ksize * _Ksections, B + (i * B_multi_stride), ldb, col_bias + (i * _Nsize), _Ksize * _Ksections, i, 0);
+ }
+ }
- return total;
- }
+ // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0
+ uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
+ Toi *buffer = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
+ _B_transposed = buffer;
- void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
blockwalker current(*this);
- Toi *buffer = reinterpret_cast<Toi *>(in_buffer);
- _B_transposed = buffer;
strategy strat(_ci);
do {
/* Figure out the size of each block. */
- unsigned int x_size = (current.xmax() - current.x0());
unsigned int k_size = (current.kmax() - current.k0());
- /* Round sizes up as needed. */
- x_size = iceildiv(x_size, strategy::out_width());
- x_size *= strategy::out_width();
+ // We need to insert padding at the end of each K section.
+ // The computation needed is a little delicate - the coordinates from the block walker are expressed in
+ // terms of the full, padded, _Ktotal.
+ // But we need to transform each section with reference to the original, unpadded, input, letting the
+ // transform pad each section as needed.
+
+ // This is needed for computations below.
+ const unsigned int rounded_section_size = roundup(_Ksize, strategy::k_unroll());
+
+ // The expected output format is an entire block of <out_width> columns interleaved, then the next block of
+ // columns, and so on. This means, as we are breaking it up vertically, we have to do it one block of
+ // <out_width> columns at a time.
+ for (unsigned int x0=current.x0(); x0 < current.xmax(); x0 += strategy::out_width() ){
+ unsigned int xmax = std::min(x0 + strategy::out_width(), current.xmax());
+
+ // Track where we are and how much work is left.
+ unsigned int kpos = current.k0();
+ unsigned int kleft = k_size;
+
+ while (kleft) {
+ // Which section are we in? Based on the rounded-up section size.
+ unsigned int k_section_base = kpos / rounded_section_size;
+ // How far into the section are we?
+ unsigned int k_offset = kpos - (k_section_base * rounded_section_size);
+
+ // We will either copy the rest of this section, or to the end of the requested length.
+ unsigned int k_length = std::min(_Ksize - k_offset, kleft);
+
+ strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
+ x0, xmax,
+ (k_section_base * _Ksize) + k_offset, // K starting point - compute row to read based on our section and the true section length.
+ (k_section_base * _Ksize) + k_offset + k_length); // K end point - starting point plus length computed above.
- k_size = iceildiv(k_size, strategy::k_unroll());
- k_size *= strategy::k_unroll();
+ // We need to modify our position based on the ROUNDED version of what we just did.
+ unsigned int padded_length = roundup(k_length, strategy::k_unroll());
- strat.transforms.PrepareB(buffer, B + (current.multi() * B_multi_stride), ldb,
- current.x0(), current.xmax(), current.k0(), current.kmax());
+ buffer += strategy::out_width() * padded_length;
- buffer += (x_size * k_size);
+ kpos += padded_length;
+ kleft -= padded_length;
+ }
+ }
} while (current.advance());
}
void set_pretransposed_B_data(void *in_buffer) override {
- _B_transposed = reinterpret_cast<Toi *>(in_buffer);
+ // Put the transposed data after the column sums - in non-transposing cases get_col_sum_size() == 0
+ uintptr_t buffer_int = reinterpret_cast<uintptr_t>(in_buffer);
+ _B_transposed = reinterpret_cast<Toi *>(buffer_int + get_col_sum_size());
+ col_bias = reinterpret_cast<int32_t *>(in_buffer);
+ }
+
+ void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride) override { // Records the bias pointer and its per-multi stride in the requantization parameters held in _os.
+ if (std::is_same<OutputStage, Requantize32>::value) { // Does nothing when the output stage is any other type.
+ Requantize32 *qp = reinterpret_cast<Requantize32 *>(&_os); // Only reached under the type check above; presumably a cast is used so the body still compiles for other OutputStage types.
+
+ qp->bias = bias; // The pointer is stored as-is; nothing is copied here.
+ qp->bias_multi_stride = bias_multi_stride;
+ }
+ }
+
+ void set_indirect_parameters(size_t string_len, const To * const * const *ptr) override { // Stores the indirect input pointer table; execute() takes the PrepareA_indirect path whenever it is non-null.
+ assert(string_len == _Ksize); // Debug-only check (assert is compiled out under NDEBUG) that the supplied length matches _Ksize.
+ _indirect_buf = ptr; // The pointer is stored as-is; nothing is copied here.
+ }
+
+ void set_convolution_parameters(ConvolutionParameters parms) override { // Builds the convolver that execute() uses for the PrepareA_convolution path.
+ assert(parms.input_channels == _Ksize); // Debug-only check (assert is compiled out under NDEBUG) that the channel count matches _Ksize.
+ _convolver = std::unique_ptr<convolver<To>>(new convolver<To>(parms)); // Assigning the new unique_ptr releases any convolver held previously.
}
// Estimate cycles for given problem given provided parameters
@@ -454,4 +1044,14 @@ public:
}
};
+// Aliases for the variations: each one fixes the output stage and/or the trailing boolean template argument of GemmInterleaved.
+template<typename strategy, typename To, typename Tr, typename OutputStage=Nothing>
+using GemmInterleavedNoMerge = GemmInterleaved<strategy, To, Tr, OutputStage, false>; // Trailing 'false' is presumably the MergeStep parameter - confirm against the class template header.
+
+template<typename strategy, typename To, typename Tr>
+using GemmInterleavedPretransposedNoMergeQuantizedInline = GemmInterleaved<strategy, To, Tr, Requantize32, false>; // Requantize32 output stage with the trailing boolean argument set to false.
+
+template<typename strategy, typename To, typename Tr>
+using GemmInterleavedQuantized = GemmInterleaved<strategy, To, Tr, Requantize32>; // Requantize32 output stage; the trailing boolean argument keeps its default.
+
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
index bdccd05326..b71f390ab9 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
@@ -250,7 +250,8 @@ class GemmInterleavedPretransposed2d : public GemmCommon<To, Tr> {
first_m,
last_m,
current.k0(),
- current.kmax());
+ current.kmax(),
+ 0);
}
}
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp
index 04cac6095c..05c5116bf3 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_qint8.cpp
@@ -25,68 +25,151 @@
#include "arm_gemm.hpp"
-#include "kernels/a64_hybrid_s8s32_dot_16x4.hpp"
-#include "kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp"
-#include "kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp"
-#include "kernels/sve_hybrid_s8s32_dot_4VLx4.hpp"
-#include "kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp"
+#include "kernels/a64_gemm_s16_8x12.hpp"
+#include "kernels/a64_gemm_s8_4x4.hpp"
+#include "kernels/a64_gemm_s8_8x12.hpp"
+#include "kernels/a64_hybrid_s8qa_dot_4x16.hpp"
+#include "kernels/a64_hybrid_s8qs_dot_6x16.hpp"
+#include "kernels/a64_hybrid_s8s32_dot_6x16.hpp"
+#include "kernels/a64_interleaved_s8s32_mmla_8x12.hpp"
+#include "kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp"
+#include "kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp"
+#include "kernels/sve_hybrid_s8s32_dot_6x4VL.hpp"
+#include "kernels/sve_hybrid_s8qa_dot_4x4VL.hpp"
+#include "kernels/sve_hybrid_s8qs_dot_6x4VL.hpp"
+#include "kernels/sve_interleaved_s8s32_dot_8x3VL.hpp"
+#include "kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp"
+#include "kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp"
+
+#include "gemm_hybrid_indirect.hpp"
#include "gemm_hybrid_quantized.hpp"
+#include "gemm_hybrid_quantized_inline.hpp"
+#include "gemm_interleaved.hpp"
#include "quantize_wrapper.hpp"
+#include "utils.hpp"
namespace arm_gemm {
static const GemmImplementation<int8_t, int8_t, Requantize32> gemm_qint8_methods[] =
{
#ifdef __ARM_FEATURE_SVE
+#ifdef MMLA_INT8
{
- GemmMethod::GEMM_HYBRID_QUANTIZED,
- "smallK_hybrid_s8s32_dot_1VLx8",
- [](const GemmArgs &args, const Requantize32 &) { return args._Ksize<=64; },
+ GemmMethod::GEMM_INTERLEAVED,
+ "sve_interleaved_s8s32_mmla_8x3VL",
+ [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>8); },
nullptr,
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<smallK_hybrid_s8s32_dot_1VLx8, int8_t, int8_t>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_s8s32_mmla_8x3VL, int8_t, int8_t>(args, qp); }
},
+#endif
{
GemmMethod::GEMM_HYBRID_QUANTIZED,
- "hybrid_s8s32_dot_4VLx4",
- [](const GemmArgs &args, const Requantize32 &) { return args._Ksize>=16; },
- [](const GemmArgs &args, const Requantize32 &) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<hybrid_s8s32_dot_4VLx4, int8_t, int8_t>(args, qp); }
+ "sve_smallK_hybrid_s8s32_dot_8x1VL",
+ [](const GemmArgs &args, const Requantize32 &) { return args._Ksize<=64 && !args._indirect_input; },
+ nullptr,
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_sve_smallK_hybrid_s8s32_dot_8x1VL, int8_t, int8_t>(args, qp); }
+},
+#ifdef SVE2
+{
+ GemmMethod::GEMM_HYBRID,
+ "sve_hybrid_s8qs_dot_6x4VL",
+ [](const GemmArgs &args, const Requantize32 &qp) { return quant_hybrid_symmetric(qp); },
+ nullptr,
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8qs_dot_6x4VL, int8_t, int8_t, Requantize32>(args, qp); }
+},
+{
+ GemmMethod::GEMM_HYBRID,
+ "sve_hybrid_s8qa_dot_4x4VL",
+ [](const GemmArgs &args, const Requantize32 &qp) { return quant_hybrid_asymmetric(qp); },
+ nullptr,
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8qa_dot_4x4VL, int8_t, int8_t, Requantize32>(args, qp); }
},
#endif
{
- GemmMethod::GEMM_HYBRID_QUANTIZED,
- "smallK_hybrid_s8s32_dot_4x8",
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32); },
+ GemmMethod::GEMM_HYBRID,
+ "sve_hybrid_s8s32_dot_6x4VL",
+ nullptr,
+ nullptr,
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_s8s32_dot_6x4VL, int8_t, int8_t, Requantize32, true>(args, qp); }
+},
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "sve_interleaved_s8s32_dot_8x3VL",
+ [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>4); },
nullptr,
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<smallK_hybrid_s8s32_dot_4x8, int8_t, int8_t>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_s8s32_dot_8x3VL, int8_t, int8_t>(args, qp); }
},
+#endif // SVE
+#ifdef MMLA_INT8
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_interleaved_s8s32_mmla_8x12",
+ [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>8); },
+ nullptr,
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_interleaved_s8s32_mmla_8x12, int8_t, int8_t>(args, qp); }
+},
+#endif
{
GemmMethod::GEMM_HYBRID_QUANTIZED,
- "smallK_hybrid_s8s32_dot_4x6",
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64); },
+ "a64_smallK_hybrid_s8s32_dot_8x4",
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._indirect_input; },
nullptr,
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<smallK_hybrid_s8s32_dot_4x6, int8_t, int8_t>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_a64_smallK_hybrid_s8s32_dot_8x4, int8_t, int8_t>(args, qp); }
},
{
GemmMethod::GEMM_HYBRID_QUANTIZED,
- "hybrid_s8s32_dot_16x4",
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && args._Ksize>=16; },
- [](const GemmArgs &args, const Requantize32 &) { return args._Nsize<=256 && args._Ksize>128; },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<hybrid_s8s32_dot_16x4, int8_t, int8_t>(args, qp); }
+ "a64_smallK_hybrid_s8s32_dot_6x4",
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._indirect_input; },
+ nullptr,
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_a64_smallK_hybrid_s8s32_dot_6x4, int8_t, int8_t>(args, qp); }
},
-/** QUANTIZE_WRAPPER_2D enables 2D parallelisation hint for IScheduler in NEGEMMAssemblyDispatch */
{
- GemmMethod::QUANTIZE_WRAPPER_2D,
- "quantized_wrapper_2d",
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_gemm_s16_8x12",
nullptr,
- [](const GemmArgs &args, const Requantize32 &) { return (args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8);},
- [](const GemmArgs &args, const Requantize32 &qp) { return new QuantizeWrapper<int8_t, int8_t, int32_t>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() == CPUModel::A53; },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_s16_8x12, int8_t, int8_t>(args, qp); }
+},
+{
+ GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_s8qs_dot_6x16",
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_dotprod() && quant_hybrid_symmetric(qp); },
+ nullptr,
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8qs_dot_6x16, int8_t, int8_t, Requantize32>(args, qp); }
+},
+{
+ GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_s8qa_dot_4x16",
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_dotprod() && quant_hybrid_asymmetric(qp); },
+ nullptr,
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8qa_dot_4x16, int8_t, int8_t, Requantize32>(args, qp); }
+},
+{
+ GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_s8s32_dot_6x16",
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); },
+ nullptr,
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_s8s32_dot_6x16, int8_t, int8_t, Requantize32, true>(args, qp); }
+},
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_gemm_s8_8x12",
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); },
+ nullptr,
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_s8_8x12, int8_t, int8_t>(args, qp); }
+},
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_gemm_s8_4x4",
+ nullptr,
+ nullptr,
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_s8_4x4, int8_t, int8_t>(args, qp); }
},
{
GemmMethod::QUANTIZE_WRAPPER,
"quantized_wrapper",
- nullptr,
+ [](const GemmArgs &args, const Requantize32 &) { return !args._indirect_input; },
nullptr,
[](const GemmArgs &args, const Requantize32 &qp) { return new QuantizeWrapper<int8_t, int8_t, int32_t>(args, qp); }
},
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp
index 0125f9c5db..7342fda5d1 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_quint8.cpp
@@ -25,13 +25,25 @@
#include "arm_gemm.hpp"
-#include "kernels/a64_hybrid_u8u32_dot_16x4.hpp"
-#include "kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp"
-#include "kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp"
-#include "kernels/sve_hybrid_u8u32_dot_4VLx4.hpp"
-#include "kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp"
+#include "kernels/a64_gemm_u16_8x12.hpp"
+#include "kernels/a64_gemm_u8_4x4.hpp"
+#include "kernels/a64_gemm_u8_8x12.hpp"
+#include "kernels/a64_hybrid_u8qa_dot_4x16.hpp"
+#include "kernels/a64_hybrid_u8u32_dot_6x16.hpp"
+#include "kernels/a64_interleaved_u8u32_mmla_8x12.hpp"
+#include "kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp"
+#include "kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp"
+#include "kernels/sve_hybrid_u8u32_dot_6x4VL.hpp"
+#include "kernels/sve_hybrid_u8qa_dot_4x4VL.hpp"
+#include "kernels/sve_interleaved_u8u32_dot_8x3VL.hpp"
+#include "kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp"
+#include "kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp"
+
+#include "gemm_hybrid_indirect.hpp"
#include "gemm_hybrid_quantized.hpp"
+#include "gemm_hybrid_quantized_inline.hpp"
+#include "gemm_interleaved.hpp"
#include "quantize_wrapper.hpp"
namespace arm_gemm {
@@ -39,54 +51,108 @@ namespace arm_gemm {
static const GemmImplementation<uint8_t, uint8_t, Requantize32> gemm_quint8_methods[] =
{
#ifdef __ARM_FEATURE_SVE
+#ifdef MMLA_INT8
{
- GemmMethod::GEMM_HYBRID_QUANTIZED,
- "smallK_hybrid_u8u32_dot_1VLx8",
- [](const GemmArgs &args, const Requantize32 &) { return args._Ksize<=64; },
+ GemmMethod::GEMM_INTERLEAVED,
+ "sve_interleaved_u8u32_mmla_8x3VL",
+ [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>8); },
nullptr,
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<smallK_hybrid_u8u32_dot_1VLx8, uint8_t, uint8_t>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_u8u32_mmla_8x3VL, uint8_t, uint8_t>(args, qp); }
},
+#endif
{
GemmMethod::GEMM_HYBRID_QUANTIZED,
- "hybrid_u8u32_dot_4VLx4",
- [](const GemmArgs &args, const Requantize32 &) { return args._Ksize>=16; },
- [](const GemmArgs &args, const Requantize32 &) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<hybrid_u8u32_dot_4VLx4, uint8_t, uint8_t>(args, qp); }
+ "sve_smallK_hybrid_u8u32_dot_8x1VL",
+ [](const GemmArgs &args, const Requantize32 &) { return args._Ksize<=64 && !args._indirect_input; },
+ nullptr,
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_sve_smallK_hybrid_u8u32_dot_8x1VL, uint8_t, uint8_t>(args, qp); }
+},
+#ifdef SVE2 // Requantizing kernels include some SVE2 only instructions (SQRDMULH, SRSHL)
+{
+ GemmMethod::GEMM_HYBRID,
+ "sve_hybrid_u8qa_dot_4x4VL",
+ [](const GemmArgs &args, const Requantize32 &qp) { return quant_hybrid_asymmetric(qp); },
+ nullptr,
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8qa_dot_4x4VL, uint8_t, uint8_t, Requantize32>(args, qp); }
},
#endif
{
- GemmMethod::GEMM_HYBRID_QUANTIZED,
- "smallK_hybrid_u8u32_dot_4x8",
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32); },
+ GemmMethod::GEMM_HYBRID,
+ "sve_hybrid_u8u32_dot_6x4VL",
nullptr,
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<smallK_hybrid_u8u32_dot_4x8, uint8_t, uint8_t>(args, qp); }
+ nullptr,
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_sve_hybrid_u8u32_dot_6x4VL, uint8_t, uint8_t, Requantize32, true>(args, qp); }
+},
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "sve_interleaved_u8u32_dot_8x3VL",
+ [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>4); },
+ nullptr,
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_sve_interleaved_u8u32_dot_8x3VL, uint8_t, uint8_t>(args, qp); }
},
+#endif
+#ifdef MMLA_INT8
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_interleaved_u8u32_mmla_8x12",
+ [](const GemmArgs &args, const Requantize32 &) { return (args._Ksize>8); },
+ nullptr,
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_interleaved_u8u32_mmla_8x12, uint8_t, uint8_t>(args, qp); }
+},
+#endif
{
GemmMethod::GEMM_HYBRID_QUANTIZED,
- "smallK_hybrid_u8u32_dot_4x6",
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64); },
+ "a64_smallK_hybrid_u8u32_dot_8x4",
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._indirect_input; },
nullptr,
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<smallK_hybrid_u8u32_dot_4x6, uint8_t, uint8_t>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_a64_smallK_hybrid_u8u32_dot_8x4, uint8_t, uint8_t>(args, qp); }
},
{
GemmMethod::GEMM_HYBRID_QUANTIZED,
- "hybrid_u8u32_dot_16x4",
- [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && args._Ksize>=16; },
- [](const GemmArgs &args, const Requantize32 &) { return ((args._Nsize<=256) && (args._Ksize>128)) || (args._maxthreads >= 8); },
- [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<hybrid_u8u32_dot_16x4, uint8_t, uint8_t>(args, qp); }
+ "a64_smallK_hybrid_u8u32_dot_6x4",
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._indirect_input; },
+ nullptr,
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridQuantized<cls_a64_smallK_hybrid_u8u32_dot_6x4, uint8_t, uint8_t>(args, qp); }
},
-/** QUANTIZE_WRAPPER_2D enables 2D parallelisation hint for IScheduler in NEGEMMAssemblyDispatch */
{
- GemmMethod::QUANTIZE_WRAPPER_2D,
- "quantized_wrapper_2d",
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_gemm_u16_8x12",
nullptr,
- [](const GemmArgs &args, const Requantize32 &) { return (args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8);},
- [](const GemmArgs &args, const Requantize32 &qp) { return new QuantizeWrapper<uint8_t, uint8_t, uint32_t>(args, qp); }
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->get_cpu_model() == CPUModel::A53; },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_u16_8x12, uint8_t, uint8_t>(args, qp); },
+},
+{
+ GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_u8qa_dot_4x16",
+ [](const GemmArgs &args, const Requantize32 &qp) { return args._ci->has_dotprod() && quant_hybrid_asymmetric(qp); },
+ [](const GemmArgs &args, const Requantize32 &) { return args._Nsize<=256 && args._Ksize>128; },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_u8qa_dot_4x16, uint8_t, uint8_t, Requantize32>(args, qp); }
+},
+{
+ GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_u8u32_dot_6x16",
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); },
+ [](const GemmArgs &args, const Requantize32 &) { return args._Nsize<=256 && args._Ksize>128; },
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmHybridIndirect<cls_a64_hybrid_u8u32_dot_6x16, uint8_t, uint8_t, Requantize32, true>(args, qp); }
+},
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_gemm_u8_8x12",
+ [](const GemmArgs &args, const Requantize32 &) { return args._ci->has_dotprod(); },
+ nullptr,
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_u8_8x12, uint8_t, uint8_t>(args, qp); }
+},
+{
+ GemmMethod::GEMM_INTERLEAVED,
+ "a64_gemm_u8_4x4",
+ nullptr,
+ nullptr,
+ [](const GemmArgs &args, const Requantize32 &qp) { return new GemmInterleavedQuantized<cls_a64_gemm_u8_4x4, uint8_t, uint8_t>(args, qp); }
},
{
GemmMethod::QUANTIZE_WRAPPER,
"quantized_wrapper",
- nullptr,
+ [](const GemmArgs &args, const Requantize32 &) { return !args._indirect_input; },
nullptr,
[](const GemmArgs &args, const Requantize32 &qp) { return new QuantizeWrapper<uint8_t, uint8_t, uint32_t>(args, qp); }
},
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
index 5e06443e19..10a35e7a11 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint16.cpp
@@ -28,17 +28,17 @@
#include "gemm_implementation.hpp"
#include "gemm_interleaved.hpp"
-#include "kernels/a64_gemm_u16_12x8.hpp"
+#include "kernels/a64_gemm_u16_8x12.hpp"
namespace arm_gemm {
static const GemmImplementation<uint16_t, uint32_t> gemm_u16_methods[] = {
{
GemmMethod::GEMM_INTERLEAVED,
- "gemm_u16_12x8",
+ "a64_gemm_u16_8x12",
nullptr,
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<gemm_u16_12x8, uint16_t, uint32_t>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u16_8x12, uint16_t, uint32_t>(args); }
},
{
GemmMethod::DEFAULT,
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
index 06e68cbc43..c300b8cdf9 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_uint8.cpp
@@ -29,18 +29,20 @@
#include "gemm_interleaved.hpp"
#include "gemm_interleaved_pretransposed_2d.hpp"
#include "gemm_hybrid.hpp"
+#include "gemm_hybrid_indirect.hpp"
-#include "kernels/a64_gemm_u16_12x8.hpp"
-#include "kernels/a64_gemm_u8_12x8.hpp"
+#include "kernels/a64_gemm_u16_8x12.hpp"
#include "kernels/a64_gemm_u8_4x4.hpp"
-#include "kernels/a64_hybrid_u8u32_dot_16x4.hpp"
-#include "kernels/a64_interleaved_u8u32_mmla_12x8.hpp"
-#include "kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp"
-#include "kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp"
-#include "kernels/sve_hybrid_u8u32_dot_4VLx4.hpp"
-#include "kernels/sve_interleaved_u8u32_dot_3VLx8.hpp"
-#include "kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp"
-#include "kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp"
+#include "kernels/a64_gemm_u8_8x12.hpp"
+#include "kernels/a64_hybrid_u8u32_dot_6x16.hpp"
+#include "kernels/a64_interleaved_u8u32_mmla_8x12.hpp"
+#include "kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp"
+#include "kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp"
+
+#include "kernels/sve_hybrid_u8u32_dot_6x4VL.hpp"
+#include "kernels/sve_interleaved_u8u32_dot_8x3VL.hpp"
+#include "kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp"
+#include "kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp"
namespace arm_gemm {
@@ -49,106 +51,84 @@ static const GemmImplementation<uint8_t, uint32_t> gemm_u8_methods[] = {
#ifdef MMLA_INT8
{
GemmMethod::GEMM_INTERLEAVED,
- "interleaved_u8u32_mmla_3VLx8",
+ "sve_interleaved_u8u32_mmla_8x3VL",
[](const GemmArgs &args) { return (args._Ksize>8); },
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<interleaved_u8u32_mmla_3VLx8, uint8_t, uint32_t>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_u8u32_mmla_8x3VL, uint8_t, uint32_t>(args); }
},
#endif
{
GemmMethod::GEMM_HYBRID,
- "smallK_hybrid_u8u32_dot_1VLx8",
- [](const GemmArgs &args) { return args._Ksize<=64; },
+ "smallK_hybrid_u8u32_dot_8x1VL",
+ [](const GemmArgs &args) { return args._Ksize<=64 && !args._indirect_input; },
nullptr,
- [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_u8u32_dot_1VLx8, uint8_t, uint32_t>(args); }
+ [](const GemmArgs &args) { return new GemmHybrid<cls_sve_smallK_hybrid_u8u32_dot_8x1VL, uint8_t, uint32_t>(args); }
},
{
GemmMethod::GEMM_HYBRID,
- "hybrid_u8u32_dot_4VLx4",
- [](const GemmArgs &args) { return args._Ksize>=16; },
+ "sve_hybrid_u8u32_dot_6x4VL",
+ nullptr,
[](const GemmArgs &args) { return ((args._Ksize <= 128) && (args._Nsize <= 128)) || ((args._nmulti > 1) && ((args._Msize / args._maxthreads) < 8)); },
- [](const GemmArgs &args) { return new GemmHybrid<hybrid_u8u32_dot_4VLx4, uint8_t, uint32_t>(args); }
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_sve_hybrid_u8u32_dot_6x4VL, uint8_t, uint32_t>(args); }
},
{
GemmMethod::GEMM_INTERLEAVED,
- "interleaved_u8u32_dot_3VLx8",
+ "sve_interleaved_u8u32_dot_8x3VL",
[](const GemmArgs &args) { return (args._Ksize>4); },
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<interleaved_u8u32_dot_3VLx8, uint8_t, uint32_t>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_sve_interleaved_u8u32_dot_8x3VL, uint8_t, uint32_t>(args); }
},
#endif
#ifdef MMLA_INT8
{
GemmMethod::GEMM_INTERLEAVED,
- "interleaved_u8u32_mmla_12x8",
+ "a64_interleaved_u8u32_mmla_8x12",
[](const GemmArgs &args) { return (args._Ksize>8); },
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<interleaved_u8u32_mmla_12x8, uint8_t, uint32_t>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_interleaved_u8u32_mmla_8x12, uint8_t, uint32_t>(args); }
},
#endif
{
GemmMethod::GEMM_HYBRID,
- "smallK_hybrid_u8u32_dot_4x8",
- [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32); },
+ "a64_smallK_hybrid_u8u32_dot_8x4",
+ [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize<=32) && !args._indirect_input; },
nullptr,
- [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_u8u32_dot_4x8, uint8_t, uint32_t>(args); }
+ [](const GemmArgs &args) { return new GemmHybrid<cls_a64_smallK_hybrid_u8u32_dot_8x4, uint8_t, uint32_t>(args); }
},
{
GemmMethod::GEMM_HYBRID,
- "smallK_hybrid_u8u32_dot_4x6",
- [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64); },
+ "a64_smallK_hybrid_u8u32_dot_6x4",
+ [](const GemmArgs &args) { return args._ci->has_dotprod() && (args._Nsize % 4 == 0) && (args._Ksize>32) && (args._Ksize<=64) && !args._indirect_input; },
nullptr,
- [](const GemmArgs &args) { return new GemmHybrid<smallK_hybrid_u8u32_dot_4x6, uint8_t, uint32_t>(args); }
-},
-{
- GemmMethod::GEMM_HYBRID,
- "hybrid_u8u32_dot_16x4",
- [](const GemmArgs &args) { return args._ci->has_dotprod() && args._Ksize>=16; },
- [](const GemmArgs &args) { return args._Nsize<=256 && args._Ksize>128; },
- [](const GemmArgs &args) { return new GemmHybrid<hybrid_u8u32_dot_16x4, uint8_t, uint32_t>(args); }
-},
-{
- GemmMethod::GEMM_INTERLEAVED_2D,
- "gemm_u8_12x8_2d",
- [](const GemmArgs &args) { return args._ci->has_dotprod(); },
- [](const GemmArgs &args) { return (args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8) ; },
- [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<gemm_u8_12x8, uint8_t, uint32_t>(args); }
+ [](const GemmArgs &args) { return new GemmHybrid<cls_a64_smallK_hybrid_u8u32_dot_6x4, uint8_t, uint32_t>(args); }
},
{
GemmMethod::GEMM_INTERLEAVED,
- "gemm_u8_12x8_1d",
- [](const GemmArgs &args) { return args._ci->has_dotprod(); },
+ "a64_gemm_u16_8x12",
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<gemm_u8_12x8, uint8_t, uint32_t>(args); }
+ [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53; },
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u16_8x12, uint8_t, uint32_t>(args); },
},
{
- GemmMethod::GEMM_INTERLEAVED_2D,
- "gemm_u16_12x8_2d",
- nullptr,
- [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53 && args._Msize > 4 && (args._Msize / args._maxthreads) < 8; },
- [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<gemm_u16_12x8, uint8_t, uint32_t>(args); },
+ GemmMethod::GEMM_HYBRID,
+ "a64_hybrid_u8u32_dot_6x16",
+ [](const GemmArgs &args) { return args._ci->has_dotprod(); },
+ [](const GemmArgs &args) { return args._Nsize<=256 && args._Ksize>128; },
+ [](const GemmArgs &args) { return new GemmHybridIndirect<cls_a64_hybrid_u8u32_dot_6x16, uint8_t, uint32_t>(args); }
},
{
GemmMethod::GEMM_INTERLEAVED,
- "gemm_u16_12x8_1d",
- nullptr,
- [](const GemmArgs &args) { return args._ci->get_cpu_model() == CPUModel::A53 && args._Msize > 4; },
- [](const GemmArgs &args) { return new GemmInterleaved<gemm_u16_12x8, uint8_t, uint32_t>(args); },
-},
-{
- GemmMethod::GEMM_INTERLEAVED_2D,
- "gemm_u8_4x4_2d",
+ "a64_gemm_u8_8x12",
+ [](const GemmArgs &args) { return args._ci->has_dotprod(); },
nullptr,
- [](const GemmArgs &args) { return ((args._maxthreads >= 8) && (args._Msize >= 8) && (args._Nsize >= 8)) ||
- ((args._Msize / args._maxthreads) < 4); },
- [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<gemm_u8_4x4, uint8_t, uint32_t>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u8_8x12, uint8_t, uint32_t>(args); }
},
{
GemmMethod::GEMM_INTERLEAVED,
- "gemm_u8_4x4_1d",
+ "a64_gemm_u8_4x4",
nullptr,
nullptr,
- [](const GemmArgs &args) { return new GemmInterleaved<gemm_u8_4x4, uint8_t, uint32_t>(args); }
+ [](const GemmArgs &args) { return new GemmInterleaved<cls_a64_gemm_u8_4x4, uint8_t, uint32_t>(args); }
},
{
GemmMethod::DEFAULT,
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
index 47909cdaeb..9de44fcb73 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
@@ -46,46 +46,39 @@ class GemvPretransposed : public GemmCommon<To, Tr> {
typedef typename strategy::operand_type Toi;
typedef typename strategy::result_type Tri;
- const unsigned int _Nsize;
- const unsigned int _Ksize;
-
- const unsigned int _nmultis;
-
- const Activation _act;
-
- const CPUInfo * const _ci;
+ const GemmArgs _args;
const unsigned int _buffer_per_multi;
- unsigned int m_block=0;
+ unsigned int k_block=0;
unsigned int n_block=0;
- const Toi *_A_pretransposed = nullptr;
+ const Toi *_B_pretransposed = nullptr;
public:
GemvPretransposed(GemvPretransposed &) = delete;
GemvPretransposed & operator= (GemvPretransposed &) = delete;
GemvPretransposed(const GemmArgs &args)
- : _Nsize(args._Nsize), _Ksize(args._Ksize), _nmultis(args._nmulti), _act(args._act), _ci(args._ci),
- _buffer_per_multi(_Ksize * iceildiv(_Nsize, strategy::A_interleave()) * strategy::A_interleave()) {
+ : _args(args),
+ _buffer_per_multi(args._Ksize * roundup(args._Nsize, strategy::out_width())) {
/* For now don't do any blocking. TODO: figure out if we should. */
- if (args._cfg && args._cfg->inner_block_size) {
- m_block = args._cfg->inner_block_size;
+ if (strategy::supports_accumulate() && args._cfg && args._cfg->inner_block_size) {
+ k_block = args._cfg->inner_block_size;
} else {
- m_block = _Ksize;
+ k_block = args._Ksize;
}
if (args._cfg && args._cfg->outer_block_size) {
n_block = args._cfg->outer_block_size;
} else {
- n_block = _Nsize;
+ n_block = args._Nsize;
}
}
// Window is number of out_width blocks, times number of multis.
ndrange_t get_window_size() const override {
- return { iceildiv(_Nsize, strategy::out_width()) * _nmultis };
+ return { iceildiv(_args._Nsize, strategy::out_width()) * _args._nmulti };
}
// Actually execute the GEMV.
@@ -93,13 +86,13 @@ public:
#ifdef CYCLE_PROFILING
profiler prof;
#endif
- strategy strat(_ci);
+ strategy strat(_args._ci);
const auto start = work_range.get_position(0);
const auto end = work_range.get_position_end(0);
/* Break the window values down into multis of interest... */
- const unsigned int window_per_multi = iceildiv(_Nsize, strategy::out_width());
+ const unsigned int window_per_multi = iceildiv(_args._Nsize, strategy::out_width());
const unsigned int multi_0 = start / window_per_multi;
const unsigned int multi_end = end / window_per_multi;
@@ -111,36 +104,25 @@ public:
for (unsigned int multi=multi_0; multi<=multi_end; multi++) {
const unsigned int n_start = (multi==multi_0) ? n_0 : 0;
- const unsigned int n_end = (multi==multi_end) ? n_max : _Nsize;
+ const unsigned int n_end = (multi==multi_end) ? n_max : _args._Nsize;
if (n_end <= n_start)
continue;
- for (unsigned int m0=0; m0<_Ksize; m0+=m_block) {
- unsigned int mmax = std::min(m0 + m_block, _Ksize);
+ for (unsigned int k0=0; k0<_args._Ksize; k0+=k_block) {
+ unsigned int kmax = std::min(k0 + k_block, _args._Ksize);
for (unsigned int n=n_start; n<n_end; n+=n_block) {
unsigned int nmax = std::min(n + n_block, n_end);
#ifdef CYCLE_PROFILING
- auto p = prof.ScopedProfiler(PROFILE_KERNEL, (mmax-m0) * (nmax-n));
+ auto p = prof.ScopedProfiler(PROFILE_KERNEL, (kmax-k0) * (nmax-n));
#endif
- /* This assumes that the underlying call was a GEMM with M=1; for the N=1 case we would have to pick up this->_Bptr below instead */
- strat.kernel(_A_pretransposed + (multi * _buffer_per_multi) + (n * _Ksize) + (m0 * strategy::A_interleave()),
- (_Ksize * strategy::A_interleave()),
- this->_Aptr + (multi * this->_A_multi_stride) + m0,
+ strat.kernel(this->_Aptr + (multi * this->_A_multi_stride) + k0,
+ _B_pretransposed + (multi * _buffer_per_multi) + (n * roundup(_args._Ksize, strategy::k_unroll())) + (k0 * strategy::out_width()),
this->_Cptr + (multi * this->_C_multi_stride) + n,
- static_cast<Tr>(0), (mmax-m0), (nmax-n));
-
- // Handle activation separately for now
- if (this->_bias) {
- activator<true>(this->_Cptr + (multi * this->_C_multi_stride) + n, 0,
- this->_bias + (multi * this->_bias_multi_stride) + n,
- _act, 1, (nmax-n));
- } else {
- activator<false>(this->_Cptr + (multi * this->_C_multi_stride) + n, 0,
- static_cast<const Tr *>(nullptr),
- _act, 1, (nmax-n));
- }
+ (nmax - n), (kmax-k0),
+ this->_bias ? this->_bias + (multi * this->_bias_multi_stride) + n : nullptr,
+ _args._act, (k0 != 0));
}
}
}
@@ -152,33 +134,27 @@ public:
}
bool B_pretranspose_required() const override {
- /* Transpose is required if _A_pretransposed is still nullptr */
- return (_A_pretransposed == nullptr);
+ /* Transpose is required if _B_pretransposed is still nullptr */
+ return (_B_pretransposed == nullptr);
}
size_t get_B_pretransposed_array_size() const override {
- return _buffer_per_multi * _nmultis * sizeof(To);
+ return _buffer_per_multi * _args._nmulti * sizeof(To);
}
void pretranspose_B_array(void *buffer, const To *B, const int ldb, const int B_multi_stride) override {
- Toi *A_buffer = reinterpret_cast<Toi *>(buffer);
-
- for (unsigned int multi=0; multi<_nmultis; multi++) {
- /* Reverse sense here as we are dealing with B rather than A. So if
- * strategy::A_transpose is false and _trB is false, we still
- * transpose. */
- if (strategy::A_transpose()) {
- Transform<strategy::A_interleave(), strategy::A_block(), false>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize);
- } else {
- Transform<strategy::A_interleave(), strategy::A_block(), true>(A_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _Nsize, 0, _Ksize);
- }
+ Toi *B_buffer = reinterpret_cast<Toi *>(buffer);
+ strategy strat(_args._ci);
+
+ for (unsigned int multi=0; multi<_args._nmulti; multi++) {
+ strat.transforms.PrepareB(B_buffer + (multi * _buffer_per_multi), B + (multi * B_multi_stride), ldb, 0, _args._Nsize, 0, _args._Ksize);
}
- _A_pretransposed = A_buffer;
+ _B_pretransposed = B_buffer;
}
void set_pretransposed_B_data(void *buffer) override {
- _A_pretransposed = reinterpret_cast<Toi *>(buffer);
+ _B_pretransposed = reinterpret_cast<Toi *>(buffer);
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp
new file mode 100644
index 0000000000..807511f0d2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a32_interleave6_block1_fp32_fp32.hpp
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2017-2018 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#ifdef __arm__
+
+#include <arm_neon.h>
+
+#include "../asmlib.hpp"
+
+template<>
+void interleave_block<6, 1, VLType::None, false>(
+ float * &outptr, const float * const * in, size_t width, size_t height,
+ size_t row_offset, bool
+)
+{
+ const float *inptr0 = in[0] + row_offset;
+ const float *inptr1 = in[1] + row_offset;
+ const float *inptr2 = in[2] + row_offset;
+ const float *inptr3 = in[3] + row_offset;
+ const float *inptr4 = in[4] + row_offset;
+ const float *inptr5 = in[5] + row_offset;
+
+ // Cope with ragged cases by aliasing the first row (which is always valid).
+ // The nonsense output produced will be suppressed later anyway.
+ switch (height) {
+ case 1:
+ inptr1 = inptr0;
+ // fall through
+ case 2:
+ inptr2 = inptr0;
+ // fall through
+ case 3:
+ inptr3 = inptr0;
+ // fall through
+ case 4:
+ inptr4 = inptr0;
+ // fall through
+ case 5:
+ inptr5 = inptr0;
+ // fall through
+ default:
+ case 6:
+ break;
+ }
+
+ //prefetch_2x(inptr0);
+ //prefetch_2x(inptr1);
+ //prefetch_2x(inptr2);
+ //prefetch_2x(inptr3);
+ //prefetch_2x(inptr4);
+ //prefetch_2x(inptr5);
+
+ for (;width>7;width-=8) {
+ __asm __volatile (
+ // Load up 8 elements (2 vectors) from each of 6 sources.
+ "VLD1.32 {d0-d3}, [%[inptr0]]!\n" // q0=A0A1A2A3
+ "VLD1.32 {d4-d7}, [%[inptr1]]!\n" // q2=B0B1B2B3
+ "VLD1.32 {d8-d11}, [%[inptr2]]!\n" // q4=C0C1C2C3
+ "VZIP.32 q0, q4\n" // q0=A0C0A1C1, q4 = A2C2A3C3
+ "VLD1.32 {d12-d15}, [%[inptr3]]!\n" // q6=D0D1D2D3
+ "VZIP.32 q2, q6\n" // q2=B0D0B1D1, q6 = B2D2B3D3
+ "VLD1.32 {d16-d19}, [%[inptr4]]!\n"
+ "VLD1.32 {d20-d23}, [%[inptr5]]!\n"
+ "VZIP.32 q8, q10\n" // q8=E0F0E1F1, q10 = E2F2E3F3
+ ASM_PREFETCH("[%[inptr0], #128]")
+ "VZIP.32 q0, q2\n" // q0 = A0B0C0D0, q2 = A1B1C1D1
+
+ // Store first elements
+ "VST1.32 {d0-d1}, [%[outptr]]!\n"
+ "VST1.32 {d16}, [%[outptr]]!\n"
+
+ "VZIP.32 q4, q6\n" // q4 = A2B2C2D2, q6 = A3B3C3D3
+
+ // Store second elements
+ "VST1.32 {d4-d5}, [%[outptr]]!\n"
+ "VZIP.32 q1, q5\n"
+ ASM_PREFETCH("[%[inptr1], #128]")
+ "VST1.32 {d17}, [%[outptr]]!\n"
+ "VZIP.32 q3, q7\n"
+
+ // Store third elements
+ "VZIP.32 q9, q11\n"
+ "VST1.32 {d8-d9}, [%[outptr]]!\n"
+ "VZIP.32 q1, q3\n"
+ ASM_PREFETCH("[%[inptr2], #128]")
+ "VST1.32 {d20}, [%[outptr]]!\n"
+
+ // Store fourth elements
+ "VZIP.32 q5, q7\n"
+ "VST1.32 {d12-d13}, [%[outptr]]!\n"
+ ASM_PREFETCH("[%[inptr3], #128]")
+ "VST1.32 {d21}, [%[outptr]]!\n"
+
+ // Fifth
+ "VST1.32 {d2-d3}, [%[outptr]]!\n"
+ ASM_PREFETCH("[%[inptr4], #128]")
+ "VST1.32 {d18}, [%[outptr]]!\n"
+
+ // Sixth
+ "VST1.32 {d6-d7}, [%[outptr]]!\n"
+ ASM_PREFETCH("[%[inptr5], #128]")
+ "VST1.32 {d19}, [%[outptr]]!\n"
+
+ // Seventh
+ "VST1.32 {d10-d11}, [%[outptr]]!\n"
+ "VST1.32 {d22}, [%[outptr]]!\n"
+
+ // Eighth
+ "VST1.32 {d14-d15}, [%[outptr]]!\n"
+ "VST1.32 {d23}, [%[outptr]]!\n"
+
+ : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
+ [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [outptr] "+r" (outptr)
+ :
+ : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "memory"
+ );
+ }
+
+ for (;width>0;width--) {
+ *outptr++ = *inptr0++;
+ *outptr++ = *inptr1++;
+ *outptr++ = *inptr2++;
+ *outptr++ = *inptr3++;
+ *outptr++ = *inptr4++;
+ *outptr++ = *inptr5++;
+ }
+}
+
+#endif // __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp
new file mode 100644
index 0000000000..8054c2b96b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8.hpp
@@ -0,0 +1,193 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<> // AArch64 specialisation: copies 4 rows of int8 data into the output as consecutive 64-byte groups (16 bytes from each row), without row sums
+void interleave_block<4, 16, VLType::None, false>(
+ int8_t * &out_ptr, const int8_t * const * in, size_t width, size_t height, // out_ptr: output cursor (updated by the asm); in: table of row pointers; width: bytes to take from each row; height: number of valid rows
+ size_t row_offset, bool // row_offset: byte offset added to every row pointer; the unnamed bool is not used by this variant
+)
+{
+ __asm__ __volatile__(
+ "ldr x22, [%x[in], #0x0]\n" // x22 = in[0] (row 0 pointer)
+ "cmp %x[height], #0x4\n" // flags consumed by the "beq 1f" below; the ldr/add in between do not touch them
+ "ldr x21, [%x[in], #0x8]\n" // x21 = in[1]
+ "add x22, x22, %x[row_offset]\n"
+ "ldr x20, [%x[in], #0x10]\n" // x20 = in[2]
+ "ldr x19, [%x[in], #0x18]\n" // x19 = in[3]; NOTE(review): all four table slots are read even when height < 4 - presumably the caller's table always has 4 entries, confirm
+ "add x21, x21, %x[row_offset]\n"
+ "add x20, x20, %x[row_offset]\n"
+ "add x19, x19, %x[row_offset]\n"
+ "beq 1f\n" // height == 4: use all four row pointers as loaded
+ "mov x19, x22\n" // height != 4: row 3 is replaced by row 0
+ "cmp %x[height], #0x2\n"
+ "csel x21, x21, x22, GE\n" // keep row 1 only if height >= 2, otherwise alias it to row 0
+ "csel x20, x20, x22, GT\n" // keep row 2 only if height > 2, otherwise alias it to row 0
+ "1:" // no_pointer_adj
+ "prfm pldl1keep, [x22, #0x0]\n" // prefetch the first 128 bytes of every row
+ "cmp %x[width], #0x10\n" // flags consumed by "blt 3f" below (prfm does not alter flags)
+ "prfm pldl1keep, [x21, #0x0]\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x19, #0x0]\n"
+ "prfm pldl1keep, [x22, #0x40]\n"
+ "prfm pldl1keep, [x21, #0x40]\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "prfm pldl1keep, [x19, #0x40]\n"
+ "blt 3f\n" // fewer than 16 bytes per row: go straight to the tail handling
+ "2:" // Main loop head: one full 16-byte block from each of the 4 rows per iteration
+ "ldr q19, [x22], #0x10\n" // post-indexed loads advance each row pointer by 16
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "ldr q18, [x21], #0x10\n"
+ "ldr q17, [x20], #0x10\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "ldr q16, [x19], #0x10\n"
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "str q19, [%x[out_ptr], #0x0]\n" // output order: row 0, row 1, row 2, row 3
+ "str q18, [%x[out_ptr], #0x10]\n"
+ "prfm pldl1keep, [x19, #0x70]\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "subs %x[width], %x[width], #0x10\n" // the flags set here are immediately overwritten by the cmp on the next line
+ "cmp %x[width], #0x10\n"
+ "add %x[out_ptr], %x[out_ptr], #0x40\n" // 4 rows x 16 bytes written
+ "bge 2b\n" // loop while at least 16 bytes per row remain
+ "3:" // Main loop skip
+ "cbz %x[width], 12f\n" // nothing left over: done
+ "tbz %x[width], #3, 7f\n" // tail (1..15 bytes): decode the remaining width bit by bit, largest chunk first
+ "ldr d19, [x22], #0x8\n" // bit 3 set: first 8 bytes; a scalar SIMD ldr zeroes the rest of the vector register, giving zero padding
+ "ldr d18, [x21], #0x8\n"
+ "ldr d17, [x20], #0x8\n"
+ "ldr d16, [x19], #0x8\n"
+ "tbz %x[width], #2, 5f\n"
+ "ld1 { v19.s }[2], [x22], #0x4\n" // bit 2 set: bytes 8..11 inserted into word lane 2
+ "ld1 { v18.s }[2], [x21], #0x4\n"
+ "ld1 { v17.s }[2], [x20], #0x4\n"
+ "ld1 { v16.s }[2], [x19], #0x4\n"
+ "tbz %x[width], #1, 4f\n"
+ "ld1 { v19.h }[6], [x22], #0x2\n" // bit 1 set: bytes 12..13
+ "ld1 { v18.h }[6], [x21], #0x2\n"
+ "ld1 { v17.h }[6], [x20], #0x2\n"
+ "ld1 { v16.h }[6], [x19], #0x2\n"
+ "tbz %x[width], #0, 11f\n"
+ "ld1 { v19.b }[14], [x22]\n" // bit 0 set: byte 14 (15-byte tail)
+ "ld1 { v18.b }[14], [x21]\n"
+ "ld1 { v17.b }[14], [x20]\n"
+ "ld1 { v16.b }[14], [x19]\n"
+ "b 11f\n"
+ "4:" // odd_loads_1_12: 12 bytes already loaded, optional 13th byte
+ "tbz %x[width], #0, 11f\n"
+ "ld1 { v19.b }[12], [x22]\n"
+ "ld1 { v18.b }[12], [x21]\n"
+ "ld1 { v17.b }[12], [x20]\n"
+ "ld1 { v16.b }[12], [x19]\n"
+ "b 11f\n"
+ "5:" // odd_loads_2_8: 8 bytes already loaded, optional 2 more then 1 more
+ "tbz %x[width], #1, 6f\n"
+ "ld1 { v19.h }[4], [x22], #0x2\n"
+ "ld1 { v18.h }[4], [x21], #0x2\n"
+ "ld1 { v17.h }[4], [x20], #0x2\n"
+ "ld1 { v16.h }[4], [x19], #0x2\n"
+ "tbz %x[width], #0, 11f\n"
+ "ld1 { v19.b }[10], [x22]\n"
+ "ld1 { v18.b }[10], [x21]\n"
+ "ld1 { v17.b }[10], [x20]\n"
+ "ld1 { v16.b }[10], [x19]\n"
+ "b 11f\n"
+ "6:" // odd_loads_1_8: 8 bytes already loaded, optional 9th byte
+ "tbz %x[width], #0, 11f\n"
+ "ld1 { v19.b }[8], [x22]\n"
+ "ld1 { v18.b }[8], [x21]\n"
+ "ld1 { v17.b }[8], [x20]\n"
+ "ld1 { v16.b }[8], [x19]\n"
+ "b 11f\n"
+ "7:" // odd_loads_4_0: fewer than 8 bytes remain, nothing loaded yet
+ "tbz %x[width], #2, 9f\n"
+ "ldr s19, [x22], #0x4\n" // first 4 bytes; scalar ldr again clears the rest of the register
+ "ldr s18, [x21], #0x4\n"
+ "ldr s17, [x20], #0x4\n"
+ "ldr s16, [x19], #0x4\n"
+ "tbz %x[width], #1, 8f\n"
+ "ld1 { v19.h }[2], [x22], #0x2\n"
+ "ld1 { v18.h }[2], [x21], #0x2\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x19], #0x2\n"
+ "tbz %x[width], #0, 11f\n"
+ "ld1 { v19.b }[6], [x22]\n"
+ "ld1 { v18.b }[6], [x21]\n"
+ "ld1 { v17.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x19]\n"
+ "b 11f\n"
+ "8:" // odd_loads_1_4: 4 bytes already loaded, optional 5th byte
+ "tbz %x[width], #0, 11f\n"
+ "ld1 { v19.b }[4], [x22]\n"
+ "ld1 { v18.b }[4], [x21]\n"
+ "ld1 { v17.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x19]\n"
+ "b 11f\n"
+ "9:" // odd_loads_2_0: fewer than 4 bytes remain, nothing loaded yet
+ "tbz %x[width], #1, 10f\n"
+ "ldr h19, [x22], #0x2\n"
+ "ldr h18, [x21], #0x2\n"
+ "ldr h17, [x20], #0x2\n"
+ "ldr h16, [x19], #0x2\n"
+ "tbz %x[width], #0, 11f\n"
+ "ld1 { v19.b }[2], [x22]\n"
+ "ld1 { v18.b }[2], [x21]\n"
+ "ld1 { v17.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x19]\n"
+ "b 11f\n"
+ "10:" // odd_loads_1_0: exactly one byte remains (width is non-zero and bits 3..1 are clear)
+ "ldr b19, [x22, #0x0]\n"
+ "ldr b18, [x21, #0x0]\n"
+ "ldr b17, [x20, #0x0]\n"
+ "ldr b16, [x19, #0x0]\n"
+ "11:" // Odd load end: the partial block is still written as a full 64-byte group
+ "str q19, [%x[out_ptr], #0x0]\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x40\n"
+ "12:" // Odds skip
+
+ : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // both are modified in place; out_ptr is a reference, so the caller sees the advanced cursor
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "x19", "x20", "x21", "x22" // every scratch register named in the asm text is declared as clobbered
+ );
+}
+
+template<> // uint8_t variant of the non-summing 4x16 interleave: forwards to the int8_t specialisation of the same template
+void interleave_block<4, 16, VLType::None, false>(
+ uint8_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height,
+ size_t row_offset, bool
+)
+{
+ // View the output cursor and the row-pointer table as signed bytes and delegate; the trailing flag is passed as false.
+ interleave_block<4, 16, VLType::None, false>(
+ reinterpret_cast<int8_t * &>(out_ptr), reinterpret_cast<const int8_t * const *>(in),
+ width, height, row_offset, false);
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp
new file mode 100644
index 0000000000..1650916f9f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_s8_s8_summing.hpp
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<> // AArch64 specialisation: copies 4 rows of int8 data as 64-byte groups (16 bytes per row) and appends four signed 32-bit per-row sums
+void interleave_block<4, 16, VLType::None, true>(
+ int8_t * &out_ptr, const int8_t * const * in, size_t width, size_t height, // out_ptr: output cursor (updated by the asm); in: table of row pointers; width: bytes to take from each row; height: number of valid rows
+ size_t row_offset, bool first // row_offset: byte offset added to every row pointer; first: when false, the 16 bytes just before out_ptr are read back as running sums and overwritten
+)
+{
+ __asm__ __volatile__(
+ "movi v28.8h, #0x0\n" // v28..v25: 16-bit partial sums for rows 0..3
+ "ldr x23, [%x[in], #0x0]\n" // x23 = in[0] (row 0 pointer)
+ "mov x22, #0x0\n" // x22: main-loop iterations since the 16-bit sums were last folded
+ "movi v27.8h, #0x0\n"
+ "ldr x21, [%x[in], #0x8]\n" // x21 = in[1]
+ "cmp %x[height], #0x4\n" // flags consumed by the "beq 1f" below; nothing in between alters them
+ "movi v26.8h, #0x0\n"
+ "ldr x20, [%x[in], #0x10]\n" // x20 = in[2]
+ "add x23, x23, %x[row_offset]\n"
+ "movi v25.8h, #0x0\n"
+ "ldr x19, [%x[in], #0x18]\n" // x19 = in[3]; NOTE(review): all four table slots are read even when height < 4 - presumably the caller's table always has 4 entries, confirm
+ "movi v24.4s, #0x0\n" // v24..v21: 32-bit sums for rows 0..3
+ "add x21, x21, %x[row_offset]\n"
+ "movi v23.4s, #0x0\n"
+ "add x20, x20, %x[row_offset]\n"
+ "movi v22.4s, #0x0\n"
+ "add x19, x19, %x[row_offset]\n"
+ "movi v21.4s, #0x0\n"
+ "beq 1f\n" // height == 4: use all four row pointers as loaded
+ "mov x19, x23\n" // height != 4: row 3 is replaced by row 0
+ "cmp %x[height], #0x2\n"
+ "csel x21, x21, x23, GE\n" // keep row 1 only if height >= 2, otherwise alias it to row 0
+ "csel x20, x20, x23, GT\n" // keep row 2 only if height > 2, otherwise alias it to row 0
+ "1:" // no_pointer_adj
+ "movi v20.4s, #0x0\n" // v20: sums carried in from a previous call (stays zero on the first pass)
+ "prfm pldl1keep, [x23, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x0]\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x19, #0x0]\n"
+ "prfm pldl1keep, [x23, #0x40]\n"
+ "prfm pldl1keep, [x21, #0x40]\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "prfm pldl1keep, [x19, #0x40]\n"
+ "cbnz %w[first], 2f\n"
+ "sub %x[out_ptr], %x[out_ptr], #0x10\n" // not the first pass: step back over the previously stored sums...
+ "ld1 { v20.4s }, [%x[out_ptr]]\n" // ...and reload them; new data is then written starting at this position
+ "2:" // first_pass
+ "cmp %x[width], #0x10\n"
+ "blt 5f\n" // fewer than 16 bytes per row: go straight to the tail handling
+ "3:" // Main loop head
+ "cmp x22, #0x7e\n"
+ "ble 4f\n" // fold only once more than 0x7e (i.e. 127) iterations have accumulated - presumably to keep the 16-bit lanes from overflowing
+ "sadalp v24.4s, v28.8h\n" // fold: pairwise-add the 16-bit partial sums into the 32-bit sums, then clear them
+ "movi v28.8h, #0x0\n"
+ "sadalp v23.4s, v27.8h\n"
+ "movi v27.8h, #0x0\n"
+ "sadalp v22.4s, v26.8h\n"
+ "movi v26.8h, #0x0\n"
+ "sadalp v21.4s, v25.8h\n"
+ "movi v25.8h, #0x0\n"
+ "mov x22, #0x0\n"
+ "4:" // no_accumulate_16
+ "ldr q19, [x23], #0x10\n" // one full 16-byte block per row; post-indexed loads advance the row pointers
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "ldr q18, [x21], #0x10\n"
+ "ldr q17, [x20], #0x10\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "ldr q16, [x19], #0x10\n"
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "str q19, [%x[out_ptr], #0x0]\n" // output order: row 0, row 1, row 2, row 3
+ "sadalp v28.8h, v19.16b\n" // signed pairwise add of the 16 bytes into the row's 8 x 16-bit partial sums
+ "prfm pldl1keep, [x19, #0x70]\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
+ "sadalp v27.8h, v18.16b\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "sadalp v26.8h, v17.16b\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "sadalp v25.8h, v16.16b\n"
+ "add x22, x22, #0x1\n"
+ "subs %x[width], %x[width], #0x10\n" // the flags set here are immediately overwritten by the cmp on the next line
+ "cmp %x[width], #0x10\n"
+ "add %x[out_ptr], %x[out_ptr], #0x40\n" // 4 rows x 16 bytes written
+ "bge 3b\n" // loop while at least 16 bytes per row remain
+ "5:" // Main loop skip
+ "cbz %x[width], 14f\n" // nothing left over: only the sums remain to be written
+ "tbz %x[width], #3, 9f\n" // tail (1..15 bytes): decode the remaining width bit by bit, largest chunk first
+ "ldr d19, [x23], #0x8\n" // bit 3 set: first 8 bytes; a scalar SIMD ldr zeroes the rest of the vector register, so padding adds 0 to the sums
+ "ldr d18, [x21], #0x8\n"
+ "ldr d17, [x20], #0x8\n"
+ "ldr d16, [x19], #0x8\n"
+ "tbz %x[width], #2, 7f\n"
+ "ld1 { v19.s }[2], [x23], #0x4\n" // bit 2 set: bytes 8..11 inserted into word lane 2
+ "ld1 { v18.s }[2], [x21], #0x4\n"
+ "ld1 { v17.s }[2], [x20], #0x4\n"
+ "ld1 { v16.s }[2], [x19], #0x4\n"
+ "tbz %x[width], #1, 6f\n"
+ "ld1 { v19.h }[6], [x23], #0x2\n" // bit 1 set: bytes 12..13
+ "ld1 { v18.h }[6], [x21], #0x2\n"
+ "ld1 { v17.h }[6], [x20], #0x2\n"
+ "ld1 { v16.h }[6], [x19], #0x2\n"
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v19.b }[14], [x23]\n" // bit 0 set: byte 14 (15-byte tail)
+ "ld1 { v18.b }[14], [x21]\n"
+ "ld1 { v17.b }[14], [x20]\n"
+ "ld1 { v16.b }[14], [x19]\n"
+ "b 13f\n"
+ "6:" // odd_loads_1_12: 12 bytes already loaded, optional 13th byte
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v19.b }[12], [x23]\n"
+ "ld1 { v18.b }[12], [x21]\n"
+ "ld1 { v17.b }[12], [x20]\n"
+ "ld1 { v16.b }[12], [x19]\n"
+ "b 13f\n"
+ "7:" // odd_loads_2_8: 8 bytes already loaded, optional 2 more then 1 more
+ "tbz %x[width], #1, 8f\n"
+ "ld1 { v19.h }[4], [x23], #0x2\n"
+ "ld1 { v18.h }[4], [x21], #0x2\n"
+ "ld1 { v17.h }[4], [x20], #0x2\n"
+ "ld1 { v16.h }[4], [x19], #0x2\n"
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v19.b }[10], [x23]\n"
+ "ld1 { v18.b }[10], [x21]\n"
+ "ld1 { v17.b }[10], [x20]\n"
+ "ld1 { v16.b }[10], [x19]\n"
+ "b 13f\n"
+ "8:" // odd_loads_1_8: 8 bytes already loaded, optional 9th byte
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v19.b }[8], [x23]\n"
+ "ld1 { v18.b }[8], [x21]\n"
+ "ld1 { v17.b }[8], [x20]\n"
+ "ld1 { v16.b }[8], [x19]\n"
+ "b 13f\n"
+ "9:" // odd_loads_4_0: fewer than 8 bytes remain, nothing loaded yet
+ "tbz %x[width], #2, 11f\n"
+ "ldr s19, [x23], #0x4\n" // first 4 bytes; scalar ldr again clears the rest of the register
+ "ldr s18, [x21], #0x4\n"
+ "ldr s17, [x20], #0x4\n"
+ "ldr s16, [x19], #0x4\n"
+ "tbz %x[width], #1, 10f\n"
+ "ld1 { v19.h }[2], [x23], #0x2\n"
+ "ld1 { v18.h }[2], [x21], #0x2\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x19], #0x2\n"
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v19.b }[6], [x23]\n"
+ "ld1 { v18.b }[6], [x21]\n"
+ "ld1 { v17.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x19]\n"
+ "b 13f\n"
+ "10:" // odd_loads_1_4: 4 bytes already loaded, optional 5th byte
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v19.b }[4], [x23]\n"
+ "ld1 { v18.b }[4], [x21]\n"
+ "ld1 { v17.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x19]\n"
+ "b 13f\n"
+ "11:" // odd_loads_2_0: fewer than 4 bytes remain, nothing loaded yet
+ "tbz %x[width], #1, 12f\n"
+ "ldr h19, [x23], #0x2\n"
+ "ldr h18, [x21], #0x2\n"
+ "ldr h17, [x20], #0x2\n"
+ "ldr h16, [x19], #0x2\n"
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v19.b }[2], [x23]\n"
+ "ld1 { v18.b }[2], [x21]\n"
+ "ld1 { v17.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x19]\n"
+ "b 13f\n"
+ "12:" // odd_loads_1_0: exactly one byte remains (width is non-zero and bits 3..1 are clear)
+ "ldr b19, [x23, #0x0]\n"
+ "ldr b18, [x21, #0x0]\n"
+ "ldr b17, [x20, #0x0]\n"
+ "ldr b16, [x19, #0x0]\n"
+ "13:" // Odd load end: the partial block is written as a full 64-byte group and added to the partial sums
+ "str q19, [%x[out_ptr], #0x0]\n"
+ "sadalp v28.8h, v19.16b\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
+ "sadalp v27.8h, v18.16b\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "sadalp v26.8h, v17.16b\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "sadalp v25.8h, v16.16b\n"
+ "add %x[out_ptr], %x[out_ptr], #0x40\n"
+ "14:" // Odds skip: final reduction of the per-row sums
+ "sadalp v24.4s, v28.8h\n" // fold the remaining 16-bit partial sums into the 32-bit sums
+ "sadalp v23.4s, v27.8h\n"
+ "addp v24.4s, v24.4s, v23.4s\n" // v24 = { row0 pair sums (2 lanes), row1 pair sums (2 lanes) }
+ "sadalp v22.4s, v26.8h\n"
+ "sadalp v21.4s, v25.8h\n"
+ "addp v23.4s, v22.4s, v21.4s\n" // v23 = { row2 pair sums (2 lanes), row3 pair sums (2 lanes) }
+ "addp v24.4s, v24.4s, v23.4s\n" // v24 = { row0 total, row1 total, row2 total, row3 total }
+ "add v24.4s, v24.4s, v20.4s\n" // add the sums carried in from the previous call (zero on the first pass)
+ "str q24, [%x[out_ptr], #0x0]\n" // the four 32-bit row sums follow the interleaved data
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // both are modified in place; out_ptr is a reference, so the caller sees the advanced cursor
+ : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23" // every scratch register named in the asm text is declared as clobbered
+ );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp
new file mode 100644
index 0000000000..af3efb25b2
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave4_block16_u8_u8_summing.hpp
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<> // AArch64 specialisation: copies 4 rows of uint8 data as 64-byte groups (16 bytes per row) and appends four unsigned 32-bit per-row sums
+void interleave_block<4, 16, VLType::None, true>(
+ uint8_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height, // out_ptr: output cursor (updated by the asm); in: table of row pointers; width: bytes to take from each row; height: number of valid rows
+ size_t row_offset, bool first // row_offset: byte offset added to every row pointer; first: when false, the 16 bytes just before out_ptr are read back as running sums and overwritten
+)
+{
+ __asm__ __volatile__(
+ "movi v28.8h, #0x0\n" // v28..v25: 16-bit partial sums for rows 0..3
+ "ldr x23, [%x[in], #0x0]\n" // x23 = in[0] (row 0 pointer)
+ "mov x22, #0x0\n" // x22: main-loop iterations since the 16-bit sums were last folded
+ "movi v27.8h, #0x0\n"
+ "ldr x21, [%x[in], #0x8]\n" // x21 = in[1]
+ "cmp %x[height], #0x4\n" // flags consumed by the "beq 1f" below; nothing in between alters them
+ "movi v26.8h, #0x0\n"
+ "ldr x20, [%x[in], #0x10]\n" // x20 = in[2]
+ "add x23, x23, %x[row_offset]\n"
+ "movi v25.8h, #0x0\n"
+ "ldr x19, [%x[in], #0x18]\n" // x19 = in[3]; NOTE(review): all four table slots are read even when height < 4 - presumably the caller's table always has 4 entries, confirm
+ "movi v24.4s, #0x0\n" // v24..v21: 32-bit sums for rows 0..3
+ "add x21, x21, %x[row_offset]\n"
+ "movi v23.4s, #0x0\n"
+ "add x20, x20, %x[row_offset]\n"
+ "movi v22.4s, #0x0\n"
+ "add x19, x19, %x[row_offset]\n"
+ "movi v21.4s, #0x0\n"
+ "beq 1f\n" // height == 4: use all four row pointers as loaded
+ "mov x19, x23\n" // height != 4: row 3 is replaced by row 0
+ "cmp %x[height], #0x2\n"
+ "csel x21, x21, x23, GE\n" // keep row 1 only if height >= 2, otherwise alias it to row 0
+ "csel x20, x20, x23, GT\n" // keep row 2 only if height > 2, otherwise alias it to row 0
+ "1:" // no_pointer_adj
+ "movi v20.4s, #0x0\n" // v20: sums carried in from a previous call (stays zero on the first pass)
+ "prfm pldl1keep, [x23, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x0]\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x19, #0x0]\n"
+ "prfm pldl1keep, [x23, #0x40]\n"
+ "prfm pldl1keep, [x21, #0x40]\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "prfm pldl1keep, [x19, #0x40]\n"
+ "cbnz %w[first], 2f\n"
+ "sub %x[out_ptr], %x[out_ptr], #0x10\n" // not the first pass: step back over the previously stored sums...
+ "ld1 { v20.4s }, [%x[out_ptr]]\n" // ...and reload them; new data is then written starting at this position
+ "2:" // first_pass
+ "cmp %x[width], #0x10\n"
+ "blt 5f\n" // fewer than 16 bytes per row: go straight to the tail handling
+ "3:" // Main loop head
+ "cmp x22, #0x7e\n"
+ "ble 4f\n" // fold only once more than 0x7e (i.e. 127) iterations have accumulated - presumably to keep the 16-bit lanes from overflowing
+ "uadalp v24.4s, v28.8h\n" // fold: pairwise-add the 16-bit partial sums into the 32-bit sums, then clear them
+ "movi v28.8h, #0x0\n"
+ "uadalp v23.4s, v27.8h\n"
+ "movi v27.8h, #0x0\n"
+ "uadalp v22.4s, v26.8h\n"
+ "movi v26.8h, #0x0\n"
+ "uadalp v21.4s, v25.8h\n"
+ "movi v25.8h, #0x0\n"
+ "mov x22, #0x0\n"
+ "4:" // no_accumulate_16
+ "ldr q19, [x23], #0x10\n" // one full 16-byte block per row; post-indexed loads advance the row pointers
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "ldr q18, [x21], #0x10\n"
+ "ldr q17, [x20], #0x10\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "ldr q16, [x19], #0x10\n"
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "str q19, [%x[out_ptr], #0x0]\n" // output order: row 0, row 1, row 2, row 3
+ "uadalp v28.8h, v19.16b\n" // unsigned pairwise add of the 16 bytes into the row's 8 x 16-bit partial sums
+ "prfm pldl1keep, [x19, #0x70]\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
+ "uadalp v27.8h, v18.16b\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "uadalp v26.8h, v17.16b\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "uadalp v25.8h, v16.16b\n"
+ "add x22, x22, #0x1\n"
+ "subs %x[width], %x[width], #0x10\n" // the flags set here are immediately overwritten by the cmp on the next line
+ "cmp %x[width], #0x10\n"
+ "add %x[out_ptr], %x[out_ptr], #0x40\n" // 4 rows x 16 bytes written
+ "bge 3b\n" // loop while at least 16 bytes per row remain
+ "5:" // Main loop skip
+ "cbz %x[width], 14f\n" // nothing left over: only the sums remain to be written
+ "tbz %x[width], #3, 9f\n" // tail (1..15 bytes): decode the remaining width bit by bit, largest chunk first
+ "ldr d19, [x23], #0x8\n" // bit 3 set: first 8 bytes; a scalar SIMD ldr zeroes the rest of the vector register, so padding adds 0 to the sums
+ "ldr d18, [x21], #0x8\n"
+ "ldr d17, [x20], #0x8\n"
+ "ldr d16, [x19], #0x8\n"
+ "tbz %x[width], #2, 7f\n"
+ "ld1 { v19.s }[2], [x23], #0x4\n" // bit 2 set: bytes 8..11 inserted into word lane 2
+ "ld1 { v18.s }[2], [x21], #0x4\n"
+ "ld1 { v17.s }[2], [x20], #0x4\n"
+ "ld1 { v16.s }[2], [x19], #0x4\n"
+ "tbz %x[width], #1, 6f\n"
+ "ld1 { v19.h }[6], [x23], #0x2\n" // bit 1 set: bytes 12..13
+ "ld1 { v18.h }[6], [x21], #0x2\n"
+ "ld1 { v17.h }[6], [x20], #0x2\n"
+ "ld1 { v16.h }[6], [x19], #0x2\n"
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v19.b }[14], [x23]\n" // bit 0 set: byte 14 (15-byte tail)
+ "ld1 { v18.b }[14], [x21]\n"
+ "ld1 { v17.b }[14], [x20]\n"
+ "ld1 { v16.b }[14], [x19]\n"
+ "b 13f\n"
+ "6:" // odd_loads_1_12: 12 bytes already loaded, optional 13th byte
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v19.b }[12], [x23]\n"
+ "ld1 { v18.b }[12], [x21]\n"
+ "ld1 { v17.b }[12], [x20]\n"
+ "ld1 { v16.b }[12], [x19]\n"
+ "b 13f\n"
+ "7:" // odd_loads_2_8: 8 bytes already loaded, optional 2 more then 1 more
+ "tbz %x[width], #1, 8f\n"
+ "ld1 { v19.h }[4], [x23], #0x2\n"
+ "ld1 { v18.h }[4], [x21], #0x2\n"
+ "ld1 { v17.h }[4], [x20], #0x2\n"
+ "ld1 { v16.h }[4], [x19], #0x2\n"
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v19.b }[10], [x23]\n"
+ "ld1 { v18.b }[10], [x21]\n"
+ "ld1 { v17.b }[10], [x20]\n"
+ "ld1 { v16.b }[10], [x19]\n"
+ "b 13f\n"
+ "8:" // odd_loads_1_8: 8 bytes already loaded, optional 9th byte
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v19.b }[8], [x23]\n"
+ "ld1 { v18.b }[8], [x21]\n"
+ "ld1 { v17.b }[8], [x20]\n"
+ "ld1 { v16.b }[8], [x19]\n"
+ "b 13f\n"
+ "9:" // odd_loads_4_0: fewer than 8 bytes remain, nothing loaded yet
+ "tbz %x[width], #2, 11f\n"
+ "ldr s19, [x23], #0x4\n" // first 4 bytes; scalar ldr again clears the rest of the register
+ "ldr s18, [x21], #0x4\n"
+ "ldr s17, [x20], #0x4\n"
+ "ldr s16, [x19], #0x4\n"
+ "tbz %x[width], #1, 10f\n"
+ "ld1 { v19.h }[2], [x23], #0x2\n"
+ "ld1 { v18.h }[2], [x21], #0x2\n"
+ "ld1 { v17.h }[2], [x20], #0x2\n"
+ "ld1 { v16.h }[2], [x19], #0x2\n"
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v19.b }[6], [x23]\n"
+ "ld1 { v18.b }[6], [x21]\n"
+ "ld1 { v17.b }[6], [x20]\n"
+ "ld1 { v16.b }[6], [x19]\n"
+ "b 13f\n"
+ "10:" // odd_loads_1_4: 4 bytes already loaded, optional 5th byte
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v19.b }[4], [x23]\n"
+ "ld1 { v18.b }[4], [x21]\n"
+ "ld1 { v17.b }[4], [x20]\n"
+ "ld1 { v16.b }[4], [x19]\n"
+ "b 13f\n"
+ "11:" // odd_loads_2_0: fewer than 4 bytes remain, nothing loaded yet
+ "tbz %x[width], #1, 12f\n"
+ "ldr h19, [x23], #0x2\n"
+ "ldr h18, [x21], #0x2\n"
+ "ldr h17, [x20], #0x2\n"
+ "ldr h16, [x19], #0x2\n"
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v19.b }[2], [x23]\n"
+ "ld1 { v18.b }[2], [x21]\n"
+ "ld1 { v17.b }[2], [x20]\n"
+ "ld1 { v16.b }[2], [x19]\n"
+ "b 13f\n"
+ "12:" // odd_loads_1_0: exactly one byte remains (width is non-zero and bits 3..1 are clear)
+ "ldr b19, [x23, #0x0]\n"
+ "ldr b18, [x21, #0x0]\n"
+ "ldr b17, [x20, #0x0]\n"
+ "ldr b16, [x19, #0x0]\n"
+ "13:" // Odd load end: the partial block is written as a full 64-byte group and added to the partial sums
+ "str q19, [%x[out_ptr], #0x0]\n"
+ "uadalp v28.8h, v19.16b\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
+ "uadalp v27.8h, v18.16b\n"
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "uadalp v26.8h, v17.16b\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "uadalp v25.8h, v16.16b\n"
+ "add %x[out_ptr], %x[out_ptr], #0x40\n"
+ "14:" // Odds skip: final reduction of the per-row sums
+ "uadalp v24.4s, v28.8h\n" // fold the remaining 16-bit partial sums into the 32-bit sums
+ "uadalp v23.4s, v27.8h\n"
+ "addp v24.4s, v24.4s, v23.4s\n" // v24 = { row0 pair sums (2 lanes), row1 pair sums (2 lanes) }
+ "uadalp v22.4s, v26.8h\n"
+ "uadalp v21.4s, v25.8h\n"
+ "addp v23.4s, v22.4s, v21.4s\n" // v23 = { row2 pair sums (2 lanes), row3 pair sums (2 lanes) }
+ "addp v24.4s, v24.4s, v23.4s\n" // v24 = { row0 total, row1 total, row2 total, row3 total }
+ "add v24.4s, v24.4s, v20.4s\n" // add the sums carried in from the previous call (zero on the first pass)
+ "str q24, [%x[out_ptr], #0x0]\n" // the four 32-bit row sums follow the interleaved data
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // both are modified in place; out_ptr is a reference, so the caller sees the advanced cursor
+ : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23" // every scratch register named in the asm text is declared as clobbered
+ );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp
new file mode 100644
index 0000000000..34d25f27b8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_bf16_fp32.hpp
@@ -0,0 +1,213 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<> // AArch64 specialisation: widens 8 rows of bfloat16 to 32-bit values and writes them column by column (8 values = 32 bytes per column)
+void interleave_block<8, 1, VLType::None, false>(
+ float * &out_ptr, const bfloat16 * const * in, size_t width, size_t height, // out_ptr: output cursor (updated by the asm); in: table of row pointers; width: elements to take from each row; height: number of valid rows
+ size_t row_offset, bool // row_offset: element offset into each row (scaled by 2 bytes in the asm); the unnamed bool is not used by this variant
+)
+{
+ __asm__ __volatile__(
+ "movi v29.8h, #0x0\n" // v29 = all zeros; zipped in below as the low 16 bits of every widened value
+ "ldr x27, [%x[in], #0x0]\n" // x27..x20 = in[0]..in[7]; all eight table slots are read before height is examined
+ "cmp %x[height], #0x8\n" // flags consumed by the "beq 1f" below; the ldr/add in between do not touch them
+ "ldr x26, [%x[in], #0x8]\n"
+ "add x27, x27, %x[row_offset], LSL #1\n" // LSL #1: row_offset is multiplied by 2 (bytes per bfloat16)
+ "ldr x25, [%x[in], #0x10]\n"
+ "ldr x24, [%x[in], #0x18]\n"
+ "add x26, x26, %x[row_offset], LSL #1\n"
+ "ldr x23, [%x[in], #0x20]\n"
+ "add x25, x25, %x[row_offset], LSL #1\n"
+ "ldr x22, [%x[in], #0x28]\n"
+ "ldr x21, [%x[in], #0x30]\n"
+ "add x24, x24, %x[row_offset], LSL #1\n"
+ "ldr x20, [%x[in], #0x38]\n" // NOTE(review): read even when height < 8 - presumably the caller's table always has 8 entries, confirm
+ "add x23, x23, %x[row_offset], LSL #1\n"
+ "add x22, x22, %x[row_offset], LSL #1\n"
+ "add x21, x21, %x[row_offset], LSL #1\n"
+ "add x20, x20, %x[row_offset], LSL #1\n"
+ "beq 1f\n" // height == 8: use all eight row pointers as loaded
+ "mov x20, x27\n" // height != 8: row 7 is replaced by row 0
+ "cmp %x[height], #0x2\n"
+ "csel x26, x26, x27, GE\n" // row 1 kept only if height >= 2, otherwise aliased to row 0
+ "csel x25, x25, x27, GT\n" // row 2 kept only if height > 2
+ "cmp %x[height], #0x4\n"
+ "csel x24, x24, x27, GE\n" // row 3 kept only if height >= 4
+ "csel x23, x23, x27, GT\n" // row 4 kept only if height > 4
+ "cmp %x[height], #0x6\n"
+ "csel x22, x22, x27, GE\n" // row 5 kept only if height >= 6
+ "csel x21, x21, x27, GT\n" // row 6 kept only if height > 6
+ "1:" // no_pointer_adj
+ "prfm pldl1keep, [x27, #0x0]\n" // prefetch the first 128 bytes of every row
+ "cmp %x[width], #0x4\n" // flags consumed by "blt 3f" below (prfm does not alter flags)
+ "prfm pldl1keep, [x26, #0x0]\n"
+ "prfm pldl1keep, [x25, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x0]\n"
+ "prfm pldl1keep, [x23, #0x0]\n"
+ "prfm pldl1keep, [x22, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x0]\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x40]\n"
+ "prfm pldl1keep, [x26, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x40]\n"
+ "prfm pldl1keep, [x22, #0x40]\n"
+ "prfm pldl1keep, [x21, #0x40]\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "blt 3f\n" // fewer than 4 elements per row: go straight to the tail handling
+ "2:" // Main loop head: 4 elements from each of the 8 rows per iteration
+ "ldr d28, [x27], #0x8\n" // 4 bfloat16 values of row 0
+ "zip1 v28.8h, v29.8h, v28.8h\n" // widen: each 16-bit value becomes the high half of a 32-bit lane, low half zero
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "ldr d27, [x26], #0x8\n"
+ "zip1 v27.8h, v29.8h, v27.8h\n"
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "ldr d26, [x25], #0x8\n"
+ "zip1 v26.8h, v29.8h, v26.8h\n"
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "ldr d25, [x24], #0x8\n"
+ "zip1 v20.4s, v28.4s, v26.4s\n" // rows 0 and 2 interleaved (columns 0..1)
+ "prfm pldl1keep, [x24, #0x70]\n"
+ "zip1 v25.8h, v29.8h, v25.8h\n"
+ "ldr d24, [x23], #0x8\n"
+ "zip1 v19.4s, v27.4s, v25.4s\n" // rows 1 and 3 interleaved (columns 0..1)
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "zip1 v24.8h, v29.8h, v24.8h\n"
+ "ldr d23, [x22], #0x8\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n" // column 0 of rows 0..3
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "zip1 v23.8h, v29.8h, v23.8h\n"
+ "ldr d22, [x21], #0x8\n"
+ "zip2 v19.4s, v20.4s, v19.4s\n" // column 1 of rows 0..3
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "zip1 v22.8h, v29.8h, v22.8h\n"
+ "ldr d21, [x20], #0x8\n"
+ "zip1 v18.4s, v24.4s, v22.4s\n" // rows 4 and 6 interleaved (columns 0..1)
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "zip1 v21.8h, v29.8h, v21.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // column 0, rows 0..3
+ "zip1 v17.4s, v23.4s, v21.4s\n" // rows 5 and 7 interleaved (columns 0..1)
+ "subs %x[width], %x[width], #0x4\n" // the flags set here are overwritten by the cmp two lines below
+ "zip2 v20.4s, v28.4s, v26.4s\n" // rows 0 and 2 interleaved (columns 2..3)
+ "cmp %x[width], #0x4\n" // flags consumed by "bge 2b"; the zip/str/add in between do not alter them
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n" // column 0, rows 4..7
+ "zip2 v16.4s, v18.4s, v17.4s\n"
+ "str q19, [%x[out_ptr], #0x20]\n" // column 1, rows 0..3
+ "zip2 v19.4s, v27.4s, v25.4s\n" // rows 1 and 3 interleaved (columns 2..3)
+ "str q16, [%x[out_ptr], #0x30]\n" // column 1, rows 4..7
+ "zip1 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x40]\n" // column 2, rows 0..3
+ "zip2 v18.4s, v24.4s, v22.4s\n"
+ "zip2 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x50]\n" // column 2, rows 4..7
+ "zip2 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x60]\n" // column 3, rows 0..3
+ "zip2 v16.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x70]\n" // column 3, rows 4..7
+ "add %x[out_ptr], %x[out_ptr], #0x80\n" // 4 columns x 8 rows x 4 bytes written
+ "bge 2b\n" // loop while at least 4 elements per row remain
+ "3:" // Main loop skip
+ "cbz %x[width], 6f\n" // nothing left over: done
+ "tbz %x[width], #1, 4f\n" // tail of 1..3 elements; x19 ends up holding the leftover column count
+ "ldr s28, [x27], #0x4\n" // 2 bfloat16 values per row
+ "ldr s27, [x26], #0x4\n"
+ "ldr s26, [x25], #0x4\n"
+ "ldr s25, [x24], #0x4\n"
+ "ldr s24, [x23], #0x4\n"
+ "ldr s23, [x22], #0x4\n"
+ "ldr s22, [x21], #0x4\n"
+ "ldr s21, [x20], #0x4\n"
+ "mov x19, #0x2\n"
+ "tbz %x[width], #0, 5f\n"
+ "ld1 { v28.h }[2], [x27]\n" // third value per row
+ "ld1 { v27.h }[2], [x26]\n"
+ "ld1 { v26.h }[2], [x25]\n"
+ "ld1 { v25.h }[2], [x24]\n"
+ "ld1 { v24.h }[2], [x23]\n"
+ "ld1 { v23.h }[2], [x22]\n"
+ "ld1 { v22.h }[2], [x21]\n"
+ "ld1 { v21.h }[2], [x20]\n"
+ "mov x19, #0x3\n"
+ "b 5f\n"
+ "4:" // odd_loads_1_0: exactly one element per row remains
+ "ldr h28, [x27, #0x0]\n"
+ "ldr h27, [x26, #0x0]\n"
+ "ldr h26, [x25, #0x0]\n"
+ "ldr h25, [x24, #0x0]\n"
+ "ldr h24, [x23, #0x0]\n"
+ "ldr h23, [x22, #0x0]\n"
+ "ldr h22, [x21, #0x0]\n"
+ "ldr h21, [x20, #0x0]\n"
+ "mov x19, #0x1\n"
+ "5:" // Odd load end: widen as in the main loop, then emit only x19 columns
+ "zip1 v28.8h, v29.8h, v28.8h\n"
+ "subs x19, x19, #0x1\n" // flags consumed by the first "beq 6f" below; zip/str/add in between do not alter them
+ "zip1 v27.8h, v29.8h, v27.8h\n"
+ "zip1 v26.8h, v29.8h, v26.8h\n"
+ "zip1 v25.8h, v29.8h, v25.8h\n"
+ "zip1 v24.8h, v29.8h, v24.8h\n"
+ "zip1 v23.8h, v29.8h, v23.8h\n"
+ "zip1 v22.8h, v29.8h, v22.8h\n"
+ "zip1 v21.8h, v29.8h, v21.8h\n"
+ "zip1 v20.4s, v28.4s, v26.4s\n"
+ "zip1 v19.4s, v27.4s, v25.4s\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover column 0, rows 0..3
+ "zip1 v18.4s, v24.4s, v22.4s\n"
+ "zip1 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n" // leftover column 0, rows 4..7
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "beq 6f\n" // only one leftover column: done
+ "zip2 v19.4s, v20.4s, v19.4s\n"
+ "zip2 v16.4s, v18.4s, v17.4s\n"
+ "str q19, [%x[out_ptr], #0x0]\n" // leftover column 1, rows 0..3
+ "str q16, [%x[out_ptr], #0x10]\n" // leftover column 1, rows 4..7
+ "subs x19, x19, #0x1\n"
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "beq 6f\n" // two leftover columns: done
+ "zip2 v20.4s, v28.4s, v26.4s\n"
+ "zip2 v19.4s, v27.4s, v25.4s\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover column 2, rows 0..3
+ "zip2 v18.4s, v24.4s, v22.4s\n"
+ "zip2 v17.4s, v23.4s, v21.4s\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n" // leftover column 2, rows 4..7
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "6:" // Odds skip
+
+ : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // both are modified in place; out_ptr is a reference, so the caller sees the advanced cursor
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27" // every scratch register named in the asm text is declared as clobbered
+ );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp
new file mode 100644
index 0000000000..d547957129
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp16.hpp
@@ -0,0 +1,270 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<> // __fp16 -> __fp16 specialisation: regroups up to 8 input rows so that each 16-byte output vector holds one column (rows 0..7 in order).
+void interleave_block<8, 1, VLType::None, false>(
+ __fp16 * &out_ptr, const __fp16 * const * in, size_t width, size_t height, // out_ptr is advanced past everything written; in[0..7] are row base pointers
+ size_t row_offset, bool // row_offset is an element offset applied to every row; the trailing bool is unnamed and unused
+)
+{
+ __asm__ __volatile__(
+ "ldr x27, [%x[in], #0x0]\n" // x27..x20 = in[0]..in[7]; all eight entries are read even when height < 8
+ "cmp %x[height], #0x8\n" // flags consumed by the "beq 1f" below
+ "ldr x26, [%x[in], #0x8]\n"
+ "add x27, x27, %x[row_offset], LSL #1\n" // row_offset is scaled by 2 bytes per element
+ "ldr x25, [%x[in], #0x10]\n"
+ "ldr x24, [%x[in], #0x18]\n"
+ "add x26, x26, %x[row_offset], LSL #1\n"
+ "ldr x23, [%x[in], #0x20]\n"
+ "add x25, x25, %x[row_offset], LSL #1\n"
+ "ldr x22, [%x[in], #0x28]\n"
+ "ldr x21, [%x[in], #0x30]\n"
+ "add x24, x24, %x[row_offset], LSL #1\n"
+ "ldr x20, [%x[in], #0x38]\n"
+ "add x23, x23, %x[row_offset], LSL #1\n"
+ "add x22, x22, %x[row_offset], LSL #1\n"
+ "add x21, x21, %x[row_offset], LSL #1\n"
+ "add x20, x20, %x[row_offset], LSL #1\n"
+ "beq 1f\n" // height == 8: every row pointer is used as loaded
+ "mov x20, x27\n" // height != 8: row 7 always reads from row 0 instead
+ "cmp %x[height], #0x2\n"
+ "csel x26, x26, x27, GE\n" // keep row 1 only if height >= 2, otherwise alias it to row 0
+ "csel x25, x25, x27, GT\n" // keep row 2 only if height > 2
+ "cmp %x[height], #0x4\n"
+ "csel x24, x24, x27, GE\n" // keep row 3 only if height >= 4
+ "csel x23, x23, x27, GT\n" // keep row 4 only if height > 4
+ "cmp %x[height], #0x6\n"
+ "csel x22, x22, x27, GE\n" // keep row 5 only if height >= 6
+ "csel x21, x21, x27, GT\n" // keep row 6 only if height > 6
+ "1:" // no_pointer_adj
+ "prfm pldl1keep, [x27, #0x0]\n" // prefetch [row + 0x0] and [row + 0x40] for all eight rows
+ "cmp %x[width], #0x8\n" // flags consumed by the "blt 3f" after the prefetches
+ "prfm pldl1keep, [x26, #0x0]\n"
+ "prfm pldl1keep, [x25, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x0]\n"
+ "prfm pldl1keep, [x23, #0x0]\n"
+ "prfm pldl1keep, [x22, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x0]\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x40]\n"
+ "prfm pldl1keep, [x26, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x40]\n"
+ "prfm pldl1keep, [x22, #0x40]\n"
+ "prfm pldl1keep, [x21, #0x40]\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "blt 3f\n" // fewer than 8 columns: go straight to the tail
+ "2:" // Main loop head
+ "ldr q30, [x27], #0x10\n" // each iteration loads 8 halfwords per row: v30=row0, v29=row1, v28=row2, v27=row3
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "ldr q29, [x26], #0x10\n"
+ "ldr q28, [x25], #0x10\n"
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "ldr q27, [x24], #0x10\n"
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "ldr q24, [x23], #0x10\n" // v24=row4, v25=row5, v23=row6, v22=row7
+ "zip1 v26.8h, v30.8h, v24.8h\n" // stage 1 of the transpose: pair row r with row r+4
+ "prfm pldl1keep, [x24, #0x70]\n"
+ "ldr q25, [x22], #0x10\n"
+ "zip2 v24.8h, v30.8h, v24.8h\n"
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "ldr q23, [x21], #0x10\n"
+ "zip1 v21.8h, v29.8h, v25.8h\n"
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "ldr q22, [x20], #0x10\n"
+ "zip1 v18.8h, v28.8h, v23.8h\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "subs %x[width], %x[width], #0x8\n" // 8 columns consumed
+ "zip1 v20.8h, v26.8h, v18.8h\n" // stage 2: merge the even-row pairs (rows 0,2,4,6)
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "zip1 v19.8h, v27.8h, v22.8h\n"
+ "cmp %x[width], #0x8\n" // flags consumed by "bge 2b"; nothing in between alters them
+ "zip1 v17.8h, v21.8h, v19.8h\n" // stage 2: merge the odd-row pairs (rows 1,3,5,7)
+ "zip2 v18.8h, v26.8h, v18.8h\n"
+ "zip1 v16.8h, v20.8h, v17.8h\n" // stage 3: column 0 for rows 0..7
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.8h, v20.8h, v17.8h\n" // column 1
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "zip2 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n" // column 2
+ "str q16, [%x[out_ptr], #0x20]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n" // column 3
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "zip2 v21.8h, v28.8h, v23.8h\n" // same three stages on the upper halves give columns 4..7
+ "zip1 v18.8h, v24.8h, v21.8h\n"
+ "zip2 v20.8h, v29.8h, v25.8h\n"
+ "zip2 v19.8h, v27.8h, v22.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x40]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x50]\n"
+ "zip2 v18.8h, v24.8h, v21.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x60]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x70]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x80\n" // 8 columns x 8 rows x 2 bytes written
+ "bge 2b\n"
+ "3:" // Main loop skip
+ "cbz %x[width], 8f\n" // no leftover columns
+ "tbz %x[width], #2, 5f\n" // tail: bits 2/1/0 of the remaining width select 4/2/1-element partial loads
+ "ldr d30, [x27], #0x8\n" // 4 leftover halfwords per row
+ "ldr d29, [x26], #0x8\n"
+ "ldr d28, [x25], #0x8\n"
+ "ldr d27, [x24], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
+ "tbz %x[width], #1, 4f\n"
+ "ld1 { v30.s }[2], [x27], #0x4\n" // 2 more halfwords per row into lanes 4..5
+ "ld1 { v29.s }[2], [x26], #0x4\n"
+ "ld1 { v28.s }[2], [x25], #0x4\n"
+ "ld1 { v27.s }[2], [x24], #0x4\n"
+ "ld1 { v24.s }[2], [x23], #0x4\n"
+ "ld1 { v25.s }[2], [x22], #0x4\n"
+ "ld1 { v23.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
+ "mov x19, #0x6\n" // x19 = number of leftover columns to emit
+ "tbz %x[width], #0, 7f\n"
+ "ld1 { v30.h }[6], [x27]\n" // 1 more halfword per row into lane 6
+ "ld1 { v29.h }[6], [x26]\n"
+ "ld1 { v28.h }[6], [x25]\n"
+ "ld1 { v27.h }[6], [x24]\n"
+ "ld1 { v24.h }[6], [x23]\n"
+ "ld1 { v25.h }[6], [x22]\n"
+ "ld1 { v23.h }[6], [x21]\n"
+ "ld1 { v22.h }[6], [x20]\n"
+ "mov x19, #0x7\n"
+ "b 7f\n"
+ "4:" // odd_loads_1_4
+ "mov x19, #0x4\n"
+ "tbz %x[width], #0, 7f\n"
+ "ld1 { v30.h }[4], [x27]\n" // fifth halfword per row into lane 4
+ "ld1 { v29.h }[4], [x26]\n"
+ "ld1 { v28.h }[4], [x25]\n"
+ "ld1 { v27.h }[4], [x24]\n"
+ "ld1 { v24.h }[4], [x23]\n"
+ "ld1 { v25.h }[4], [x22]\n"
+ "ld1 { v23.h }[4], [x21]\n"
+ "ld1 { v22.h }[4], [x20]\n"
+ "mov x19, #0x5\n"
+ "b 7f\n"
+ "5:" // odd_loads_2_0
+ "tbz %x[width], #1, 6f\n"
+ "ldr s30, [x27], #0x4\n" // 2 leftover halfwords per row
+ "ldr s29, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
+ "ldr s27, [x24], #0x4\n"
+ "ldr s24, [x23], #0x4\n"
+ "ldr s25, [x22], #0x4\n"
+ "ldr s23, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
+ "mov x19, #0x2\n"
+ "tbz %x[width], #0, 7f\n"
+ "ld1 { v30.h }[2], [x27]\n" // third halfword per row into lane 2
+ "ld1 { v29.h }[2], [x26]\n"
+ "ld1 { v28.h }[2], [x25]\n"
+ "ld1 { v27.h }[2], [x24]\n"
+ "ld1 { v24.h }[2], [x23]\n"
+ "ld1 { v25.h }[2], [x22]\n"
+ "ld1 { v23.h }[2], [x21]\n"
+ "ld1 { v22.h }[2], [x20]\n"
+ "mov x19, #0x3\n"
+ "b 7f\n"
+ "6:" // odd_loads_1_0
+ "ldr h30, [x27, #0x0]\n" // exactly one leftover halfword per row
+ "ldr h29, [x26, #0x0]\n"
+ "ldr h28, [x25, #0x0]\n"
+ "ldr h27, [x24, #0x0]\n"
+ "ldr h24, [x23, #0x0]\n"
+ "ldr h25, [x22, #0x0]\n"
+ "ldr h23, [x21, #0x0]\n"
+ "ldr h22, [x20, #0x0]\n"
+ "mov x19, #0x1\n"
+ "7:" // Odd load end
+ "zip1 v26.8h, v30.8h, v24.8h\n" // same zip network as the main loop, but one 16-byte column is stored per step
+ "subs x19, x19, #0x1\n" // flags survive until the next "beq 8f" (zip/str/add do not set flags)
+ "zip1 v18.8h, v28.8h, v23.8h\n"
+ "zip1 v20.8h, v26.8h, v18.8h\n"
+ "zip1 v21.8h, v29.8h, v25.8h\n"
+ "zip1 v19.8h, v27.8h, v22.8h\n"
+ "zip1 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v16.8h, v20.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover column 0
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 8f\n" // stop once x19 leftover columns have been written
+ "zip2 v16.8h, v20.8h, v17.8h\n"
+ "subs x19, x19, #0x1\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover column 1
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 8f\n"
+ "zip2 v18.8h, v26.8h, v18.8h\n"
+ "zip2 v17.8h, v21.8h, v19.8h\n"
+ "subs x19, x19, #0x1\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover column 2
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 8f\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "subs x19, x19, #0x1\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover column 3
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 8f\n"
+ "zip2 v24.8h, v30.8h, v24.8h\n"
+ "zip2 v21.8h, v28.8h, v23.8h\n"
+ "subs x19, x19, #0x1\n"
+ "zip1 v18.8h, v24.8h, v21.8h\n"
+ "zip2 v20.8h, v29.8h, v25.8h\n"
+ "zip2 v19.8h, v27.8h, v22.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover column 4
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 8f\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "subs x19, x19, #0x1\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover column 5
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 8f\n"
+ "zip2 v18.8h, v24.8h, v21.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover column 6 (at most 7 leftovers, so no further check)
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "8:" // Odds skip
+
+ : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // both are modified by the asm; out_ptr is a reference, so the caller sees the advance
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) // read-only inputs
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp
new file mode 100644
index 0000000000..b45e622a47
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp16_fp32.hpp
@@ -0,0 +1,212 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<> // __fp16 -> float specialisation: widens each element to fp32 and writes, per column, rows 0..3 then rows 4..7 (32 bytes per column).
+void interleave_block<8, 1, VLType::None, false>(
+ float * &out_ptr, const __fp16 * const * in, size_t width, size_t height, // out_ptr is advanced past everything written; in[0..7] are row base pointers
+ size_t row_offset, bool // row_offset is an element offset applied to every row; the trailing bool is unnamed and unused
+)
+{
+ __asm__ __volatile__(
+ "ldr x27, [%x[in], #0x0]\n" // x27..x20 = in[0]..in[7]; all eight entries are read even when height < 8
+ "cmp %x[height], #0x8\n" // flags consumed by the "beq 1f" below
+ "ldr x26, [%x[in], #0x8]\n"
+ "add x27, x27, %x[row_offset], LSL #1\n" // row_offset is scaled by 2 bytes per (fp16) input element
+ "ldr x25, [%x[in], #0x10]\n"
+ "ldr x24, [%x[in], #0x18]\n"
+ "add x26, x26, %x[row_offset], LSL #1\n"
+ "ldr x23, [%x[in], #0x20]\n"
+ "add x25, x25, %x[row_offset], LSL #1\n"
+ "ldr x22, [%x[in], #0x28]\n"
+ "ldr x21, [%x[in], #0x30]\n"
+ "add x24, x24, %x[row_offset], LSL #1\n"
+ "ldr x20, [%x[in], #0x38]\n"
+ "add x23, x23, %x[row_offset], LSL #1\n"
+ "add x22, x22, %x[row_offset], LSL #1\n"
+ "add x21, x21, %x[row_offset], LSL #1\n"
+ "add x20, x20, %x[row_offset], LSL #1\n"
+ "beq 1f\n" // height == 8: every row pointer is used as loaded
+ "mov x20, x27\n" // height != 8: row 7 always reads from row 0 instead
+ "cmp %x[height], #0x2\n"
+ "csel x26, x26, x27, GE\n" // keep row 1 only if height >= 2, otherwise alias it to row 0
+ "csel x25, x25, x27, GT\n" // keep row 2 only if height > 2
+ "cmp %x[height], #0x4\n"
+ "csel x24, x24, x27, GE\n" // keep row 3 only if height >= 4
+ "csel x23, x23, x27, GT\n" // keep row 4 only if height > 4
+ "cmp %x[height], #0x6\n"
+ "csel x22, x22, x27, GE\n" // keep row 5 only if height >= 6
+ "csel x21, x21, x27, GT\n" // keep row 6 only if height > 6
+ "1:" // no_pointer_adj
+ "prfm pldl1keep, [x27, #0x0]\n" // prefetch [row + 0x0] and [row + 0x40] for all eight rows
+ "cmp %x[width], #0x4\n" // flags consumed by the "blt 3f" after the prefetches
+ "prfm pldl1keep, [x26, #0x0]\n"
+ "prfm pldl1keep, [x25, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x0]\n"
+ "prfm pldl1keep, [x23, #0x0]\n"
+ "prfm pldl1keep, [x22, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x0]\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x40]\n"
+ "prfm pldl1keep, [x26, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x40]\n"
+ "prfm pldl1keep, [x22, #0x40]\n"
+ "prfm pldl1keep, [x21, #0x40]\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "blt 3f\n" // fewer than 4 columns: go straight to the tail
+ "2:" // Main loop head
+ "ldr d29, [x27], #0x8\n" // each iteration loads 4 halfwords per row: v29..v22 = rows 0..7
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "ldr d28, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "ldr d26, [x24], #0x8\n"
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "ldr d25, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "prfm pldl1keep, [x24, #0x70]\n"
+ "ldr d23, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "fcvtl v29.4s, v29.4h\n" // widen the four fp16 values of each row to fp32 in place
+ "fcvtl v28.4s, v28.4h\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "fcvtl v27.4s, v27.4h\n"
+ "zip1 v20.4s, v29.4s, v27.4s\n" // stage 1 of the transpose: pair row r with row r+2
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "fcvtl v26.4s, v26.4h\n"
+ "zip2 v18.4s, v29.4s, v27.4s\n"
+ "fcvtl v25.4s, v25.4h\n"
+ "fcvtl v24.4s, v24.4h\n"
+ "zip1 v19.4s, v28.4s, v26.4s\n"
+ "fcvtl v23.4s, v23.4h\n"
+ "zip2 v17.4s, v28.4s, v26.4s\n"
+ "fcvtl v22.4s, v22.4h\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n" // column 0, rows 0..3
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v21.4s, v20.4s, v19.4s\n" // column 1, rows 0..3 (stored at #0x20 below)
+ "subs %x[width], %x[width], #0x4\n" // 4 columns consumed
+ "zip1 v20.4s, v18.4s, v17.4s\n" // column 2, rows 0..3 (stored at #0x40 below)
+ "cmp %x[width], #0x4\n" // flags consumed by "bge 2b"; nothing in between alters them
+ "zip2 v19.4s, v18.4s, v17.4s\n" // column 3, rows 0..3 (stored at #0x60 below)
+ "zip1 v18.4s, v25.4s, v23.4s\n"
+ "zip1 v17.4s, v24.4s, v22.4s\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n" // column 0, rows 4..7
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "zip2 v16.4s, v18.4s, v17.4s\n" // column 1, rows 4..7
+ "str q21, [%x[out_ptr], #0x20]\n"
+ "zip2 v18.4s, v25.4s, v23.4s\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "zip2 v17.4s, v24.4s, v22.4s\n"
+ "str q20, [%x[out_ptr], #0x40]\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n" // column 2, rows 4..7
+ "str q16, [%x[out_ptr], #0x50]\n"
+ "zip2 v16.4s, v18.4s, v17.4s\n" // column 3, rows 4..7
+ "str q19, [%x[out_ptr], #0x60]\n"
+ "str q16, [%x[out_ptr], #0x70]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x80\n" // 4 columns x 8 rows x 4 bytes written
+ "bge 2b\n"
+ "3:" // Main loop skip
+ "cbz %x[width], 6f\n" // no leftover columns
+ "tbz %x[width], #1, 4f\n" // tail: bits 1/0 of the remaining width select 2/1-element partial loads
+ "ldr s29, [x27], #0x4\n" // 2 leftover halfwords per row
+ "ldr s28, [x26], #0x4\n"
+ "ldr s27, [x25], #0x4\n"
+ "ldr s26, [x24], #0x4\n"
+ "ldr s25, [x23], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
+ "ldr s23, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
+ "mov x19, #0x2\n" // x19 = number of leftover columns to emit
+ "tbz %x[width], #0, 5f\n"
+ "ld1 { v29.h }[2], [x27]\n" // third halfword per row into lane 2
+ "ld1 { v28.h }[2], [x26]\n"
+ "ld1 { v27.h }[2], [x25]\n"
+ "ld1 { v26.h }[2], [x24]\n"
+ "ld1 { v25.h }[2], [x23]\n"
+ "ld1 { v24.h }[2], [x22]\n"
+ "ld1 { v23.h }[2], [x21]\n"
+ "ld1 { v22.h }[2], [x20]\n"
+ "mov x19, #0x3\n"
+ "b 5f\n"
+ "4:" // odd_loads_1_0
+ "ldr h29, [x27, #0x0]\n" // exactly one leftover halfword per row
+ "ldr h28, [x26, #0x0]\n"
+ "ldr h27, [x25, #0x0]\n"
+ "ldr h26, [x24, #0x0]\n"
+ "ldr h25, [x23, #0x0]\n"
+ "ldr h24, [x22, #0x0]\n"
+ "ldr h23, [x21, #0x0]\n"
+ "ldr h22, [x20, #0x0]\n"
+ "mov x19, #0x1\n"
+ "5:" // Odd load end
+ "fcvtl v29.4s, v29.4h\n" // widen and transpose as in the main loop, storing 32 bytes per leftover column
+ "fcvtl v28.4s, v28.4h\n"
+ "fcvtl v27.4s, v27.4h\n"
+ "zip1 v20.4s, v29.4s, v27.4s\n"
+ "fcvtl v26.4s, v26.4h\n"
+ "fcvtl v25.4s, v25.4h\n"
+ "zip1 v19.4s, v28.4s, v26.4s\n"
+ "fcvtl v24.4s, v24.4h\n"
+ "fcvtl v23.4s, v23.4h\n"
+ "zip1 v16.4s, v20.4s, v19.4s\n"
+ "fcvtl v22.4s, v22.4h\n"
+ "zip1 v18.4s, v25.4s, v23.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover column 0, rows 0..3
+ "subs x19, x19, #0x1\n" // flags survive until the next "beq 6f" (zip/str/add do not set flags)
+ "zip1 v17.4s, v24.4s, v22.4s\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n" // leftover column 0, rows 4..7
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "beq 6f\n" // stop once x19 leftover columns have been written
+ "zip2 v21.4s, v20.4s, v19.4s\n"
+ "zip2 v16.4s, v18.4s, v17.4s\n"
+ "str q21, [%x[out_ptr], #0x0]\n" // leftover column 1
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "subs x19, x19, #0x1\n"
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "beq 6f\n"
+ "zip2 v18.4s, v29.4s, v27.4s\n"
+ "zip2 v17.4s, v28.4s, v26.4s\n"
+ "zip1 v20.4s, v18.4s, v17.4s\n"
+ "str q20, [%x[out_ptr], #0x0]\n" // leftover column 2 (at most 3 leftovers, so no further check)
+ "zip2 v18.4s, v25.4s, v23.4s\n"
+ "zip2 v17.4s, v24.4s, v22.4s\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "6:" // Odds skip
+
+ : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // both are modified by the asm; out_ptr is a reference, so the caller sees the advance
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) // read-only inputs
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp
new file mode 100644
index 0000000000..3f38859c1c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_fp32_fp32.hpp
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<> // float -> float specialisation: writes, per column, rows 0..3 then rows 4..7 (32 bytes per column).
+void interleave_block<8, 1, VLType::None, false>(
+ float * &out_ptr, const float * const * in, size_t width, size_t height, // out_ptr is advanced past everything written; in[0..7] are row base pointers
+ size_t row_offset, bool // row_offset is an element offset applied to every row; the trailing bool is unnamed and unused
+)
+{
+ __asm__ __volatile__(
+ "ldr x27, [%x[in], #0x0]\n" // x27..x20 = in[0]..in[7]; all eight entries are read even when height < 8
+ "cmp %x[height], #0x8\n" // flags consumed by the "beq 1f" below
+ "ldr x26, [%x[in], #0x8]\n"
+ "add x27, x27, %x[row_offset], LSL #2\n" // row_offset is scaled by 4 bytes per element
+ "ldr x25, [%x[in], #0x10]\n"
+ "ldr x24, [%x[in], #0x18]\n"
+ "add x26, x26, %x[row_offset], LSL #2\n"
+ "ldr x23, [%x[in], #0x20]\n"
+ "add x25, x25, %x[row_offset], LSL #2\n"
+ "ldr x22, [%x[in], #0x28]\n"
+ "ldr x21, [%x[in], #0x30]\n"
+ "add x24, x24, %x[row_offset], LSL #2\n"
+ "ldr x20, [%x[in], #0x38]\n"
+ "add x23, x23, %x[row_offset], LSL #2\n"
+ "add x22, x22, %x[row_offset], LSL #2\n"
+ "add x21, x21, %x[row_offset], LSL #2\n"
+ "add x20, x20, %x[row_offset], LSL #2\n"
+ "beq 1f\n" // height == 8: every row pointer is used as loaded
+ "mov x20, x27\n" // height != 8: row 7 always reads from row 0 instead
+ "cmp %x[height], #0x2\n"
+ "csel x26, x26, x27, GE\n" // keep row 1 only if height >= 2, otherwise alias it to row 0
+ "csel x25, x25, x27, GT\n" // keep row 2 only if height > 2
+ "cmp %x[height], #0x4\n"
+ "csel x24, x24, x27, GE\n" // keep row 3 only if height >= 4
+ "csel x23, x23, x27, GT\n" // keep row 4 only if height > 4
+ "cmp %x[height], #0x6\n"
+ "csel x22, x22, x27, GE\n" // keep row 5 only if height >= 6
+ "csel x21, x21, x27, GT\n" // keep row 6 only if height > 6
+ "1:" // no_pointer_adj
+ "prfm pldl1keep, [x27, #0x0]\n" // prefetch [row + 0x0] and [row + 0x40] for all eight rows
+ "cmp %x[width], #0x4\n" // flags consumed by the "blt 3f" after the prefetches
+ "prfm pldl1keep, [x26, #0x0]\n"
+ "prfm pldl1keep, [x25, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x0]\n"
+ "prfm pldl1keep, [x23, #0x0]\n"
+ "prfm pldl1keep, [x22, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x0]\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x40]\n"
+ "prfm pldl1keep, [x26, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x40]\n"
+ "prfm pldl1keep, [x22, #0x40]\n"
+ "prfm pldl1keep, [x21, #0x40]\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "blt 3f\n" // fewer than 4 columns: go straight to the tail
+ "2:" // Main loop head
+ "ldr q28, [x27], #0x10\n" // each iteration loads 4 floats per row: v28=row0, v27=row1, v26=row2, v22=row3
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "ldr q27, [x26], #0x10\n"
+ "ldr q26, [x25], #0x10\n"
+ "zip1 v23.4s, v28.4s, v26.4s\n" // stage 1 of the transpose: pair row r with row r+2
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "ldr q22, [x24], #0x10\n"
+ "zip2 v26.4s, v28.4s, v26.4s\n"
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "ldr q25, [x23], #0x10\n" // v25=row4, v24=row5, v19=row6, v21=row7
+ "zip1 v20.4s, v27.4s, v22.4s\n"
+ "prfm pldl1keep, [x24, #0x70]\n"
+ "ldr q24, [x22], #0x10\n"
+ "zip1 v16.4s, v23.4s, v20.4s\n" // column 0, rows 0..3
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "ldr q19, [x21], #0x10\n"
+ "zip2 v23.4s, v23.4s, v20.4s\n" // column 1, rows 0..3 (stored at #0x20 below)
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "zip2 v22.4s, v27.4s, v22.4s\n"
+ "ldr q21, [x20], #0x10\n"
+ "zip1 v18.4s, v25.4s, v19.4s\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v20.4s, v26.4s, v22.4s\n" // column 2, rows 0..3 (stored at #0x40 below)
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "zip1 v16.4s, v24.4s, v21.4s\n"
+ "subs %x[width], %x[width], #0x4\n" // 4 columns consumed
+ "zip1 v17.4s, v18.4s, v16.4s\n" // column 0, rows 4..7
+ "cmp %x[width], #0x4\n" // flags consumed by "bge 2b"; nothing in between alters them
+ "zip2 v16.4s, v18.4s, v16.4s\n" // column 1, rows 4..7
+ "str q17, [%x[out_ptr], #0x10]\n"
+ "zip2 v19.4s, v25.4s, v19.4s\n"
+ "str q23, [%x[out_ptr], #0x20]\n"
+ "zip2 v18.4s, v24.4s, v21.4s\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "zip1 v16.4s, v19.4s, v18.4s\n" // column 2, rows 4..7
+ "str q20, [%x[out_ptr], #0x40]\n"
+ "zip2 v17.4s, v26.4s, v22.4s\n" // column 3, rows 0..3
+ "str q16, [%x[out_ptr], #0x50]\n"
+ "zip2 v16.4s, v19.4s, v18.4s\n" // column 3, rows 4..7
+ "str q17, [%x[out_ptr], #0x60]\n"
+ "str q16, [%x[out_ptr], #0x70]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x80\n" // 4 columns x 8 rows x 4 bytes written
+ "bge 2b\n"
+ "3:" // Main loop skip
+ "cbz %x[width], 6f\n" // no leftover columns
+ "tbz %x[width], #1, 4f\n" // tail: bits 1/0 of the remaining width select 2/1-element partial loads
+ "ldr d28, [x27], #0x8\n" // 2 leftover floats per row
+ "ldr d27, [x26], #0x8\n"
+ "ldr d26, [x25], #0x8\n"
+ "ldr d22, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d19, [x21], #0x8\n"
+ "ldr d21, [x20], #0x8\n"
+ "mov x19, #0x2\n" // x19 = number of leftover columns to emit
+ "tbz %x[width], #0, 5f\n"
+ "ld1 { v28.s }[2], [x27]\n" // third float per row into lane 2
+ "ld1 { v27.s }[2], [x26]\n"
+ "ld1 { v26.s }[2], [x25]\n"
+ "ld1 { v22.s }[2], [x24]\n"
+ "ld1 { v25.s }[2], [x23]\n"
+ "ld1 { v24.s }[2], [x22]\n"
+ "ld1 { v19.s }[2], [x21]\n"
+ "ld1 { v21.s }[2], [x20]\n"
+ "mov x19, #0x3\n"
+ "b 5f\n"
+ "4:" // odd_loads_1_0
+ "ldr s28, [x27, #0x0]\n" // exactly one leftover float per row
+ "ldr s27, [x26, #0x0]\n"
+ "ldr s26, [x25, #0x0]\n"
+ "ldr s22, [x24, #0x0]\n"
+ "ldr s25, [x23, #0x0]\n"
+ "ldr s24, [x22, #0x0]\n"
+ "ldr s19, [x21, #0x0]\n"
+ "ldr s21, [x20, #0x0]\n"
+ "mov x19, #0x1\n"
+ "5:" // Odd load end
+ "zip1 v23.4s, v28.4s, v26.4s\n" // same zip network as the main loop, storing 32 bytes per leftover column
+ "subs x19, x19, #0x1\n" // flags survive until the next "beq 6f" (zip/str/add do not set flags)
+ "zip1 v20.4s, v27.4s, v22.4s\n"
+ "zip1 v16.4s, v23.4s, v20.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover column 0, rows 0..3
+ "zip1 v18.4s, v25.4s, v19.4s\n"
+ "zip1 v16.4s, v24.4s, v21.4s\n"
+ "zip1 v17.4s, v18.4s, v16.4s\n"
+ "str q17, [%x[out_ptr], #0x10]\n" // leftover column 0, rows 4..7
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "beq 6f\n" // stop once x19 leftover columns have been written
+ "zip2 v23.4s, v23.4s, v20.4s\n"
+ "zip2 v16.4s, v18.4s, v16.4s\n"
+ "str q23, [%x[out_ptr], #0x0]\n" // leftover column 1
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "subs x19, x19, #0x1\n"
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "beq 6f\n"
+ "zip2 v26.4s, v28.4s, v26.4s\n"
+ "zip2 v22.4s, v27.4s, v22.4s\n"
+ "zip1 v20.4s, v26.4s, v22.4s\n"
+ "str q20, [%x[out_ptr], #0x0]\n" // leftover column 2 (at most 3 leftovers, so no further check)
+ "zip2 v19.4s, v25.4s, v19.4s\n"
+ "zip2 v18.4s, v24.4s, v21.4s\n"
+ "zip1 v16.4s, v19.4s, v18.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "6:" // Odds skip
+
+ : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // both are modified by the asm; out_ptr is a reference, so the caller sees the advance
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) // read-only inputs
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp
new file mode 100644
index 0000000000..03f552a575
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16.hpp
@@ -0,0 +1,282 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<> // int16_t -> int16_t specialisation: regroups up to 8 input rows so that each 16-byte output vector holds one column (rows 0..7 in order).
+void interleave_block<8, 1, VLType::None, false>(
+ int16_t * &out_ptr, const int16_t * const * in, size_t width, size_t height, // out_ptr is advanced past everything written; in[0..7] are row base pointers
+ size_t row_offset, bool // row_offset is an element offset applied to every row; the trailing bool is unnamed and unused
+)
+{
+ __asm__ __volatile__(
+ "ldr x27, [%x[in], #0x0]\n" // x27..x20 = in[0]..in[7]; all eight entries are read even when height < 8
+ "cmp %x[height], #0x8\n" // flags consumed by the "beq 1f" below
+ "ldr x26, [%x[in], #0x8]\n"
+ "add x27, x27, %x[row_offset], LSL #1\n" // row_offset is scaled by 2 bytes per element
+ "ldr x25, [%x[in], #0x10]\n"
+ "ldr x24, [%x[in], #0x18]\n"
+ "add x26, x26, %x[row_offset], LSL #1\n"
+ "ldr x23, [%x[in], #0x20]\n"
+ "add x25, x25, %x[row_offset], LSL #1\n"
+ "ldr x22, [%x[in], #0x28]\n"
+ "ldr x21, [%x[in], #0x30]\n"
+ "add x24, x24, %x[row_offset], LSL #1\n"
+ "ldr x20, [%x[in], #0x38]\n"
+ "add x23, x23, %x[row_offset], LSL #1\n"
+ "add x22, x22, %x[row_offset], LSL #1\n"
+ "add x21, x21, %x[row_offset], LSL #1\n"
+ "add x20, x20, %x[row_offset], LSL #1\n"
+ "beq 1f\n" // height == 8: every row pointer is used as loaded
+ "mov x20, x27\n" // height != 8: row 7 always reads from row 0 instead
+ "cmp %x[height], #0x2\n"
+ "csel x26, x26, x27, GE\n" // keep row 1 only if height >= 2, otherwise alias it to row 0
+ "csel x25, x25, x27, GT\n" // keep row 2 only if height > 2
+ "cmp %x[height], #0x4\n"
+ "csel x24, x24, x27, GE\n" // keep row 3 only if height >= 4
+ "csel x23, x23, x27, GT\n" // keep row 4 only if height > 4
+ "cmp %x[height], #0x6\n"
+ "csel x22, x22, x27, GE\n" // keep row 5 only if height >= 6
+ "csel x21, x21, x27, GT\n" // keep row 6 only if height > 6
+ "1:" // no_pointer_adj
+ "prfm pldl1keep, [x27, #0x0]\n" // prefetch [row + 0x0] and [row + 0x40] for all eight rows
+ "cmp %x[width], #0x8\n" // flags consumed by the "blt 3f" after the prefetches
+ "prfm pldl1keep, [x26, #0x0]\n"
+ "prfm pldl1keep, [x25, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x0]\n"
+ "prfm pldl1keep, [x23, #0x0]\n"
+ "prfm pldl1keep, [x22, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x0]\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x40]\n"
+ "prfm pldl1keep, [x26, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x40]\n"
+ "prfm pldl1keep, [x22, #0x40]\n"
+ "prfm pldl1keep, [x21, #0x40]\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "blt 3f\n" // fewer than 8 columns: go straight to the tail
+ "2:" // Main loop head
+ "ldr q30, [x27], #0x10\n" // each iteration loads 8 halfwords per row: v30=row0, v29=row1, v28=row2, v27=row3
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "ldr q29, [x26], #0x10\n"
+ "ldr q28, [x25], #0x10\n"
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "ldr q27, [x24], #0x10\n"
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "ldr q24, [x23], #0x10\n" // v24=row4, v25=row5, v23=row6, v22=row7
+ "zip1 v26.8h, v30.8h, v24.8h\n" // stage 1 of the transpose: pair row r with row r+4
+ "prfm pldl1keep, [x24, #0x70]\n"
+ "ldr q25, [x22], #0x10\n"
+ "zip2 v24.8h, v30.8h, v24.8h\n"
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "ldr q23, [x21], #0x10\n"
+ "zip1 v21.8h, v29.8h, v25.8h\n"
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "ldr q22, [x20], #0x10\n"
+ "zip1 v18.8h, v28.8h, v23.8h\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "subs %x[width], %x[width], #0x8\n" // 8 columns consumed
+ "zip1 v20.8h, v26.8h, v18.8h\n" // stage 2: merge the even-row pairs (rows 0,2,4,6)
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "zip1 v19.8h, v27.8h, v22.8h\n"
+ "cmp %x[width], #0x8\n" // flags consumed by "bge 2b"; nothing in between alters them
+ "zip1 v17.8h, v21.8h, v19.8h\n" // stage 2: merge the odd-row pairs (rows 1,3,5,7)
+ "zip2 v18.8h, v26.8h, v18.8h\n"
+ "zip1 v16.8h, v20.8h, v17.8h\n" // stage 3: column 0 for rows 0..7
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.8h, v20.8h, v17.8h\n" // column 1
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "zip2 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n" // column 2
+ "str q16, [%x[out_ptr], #0x20]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n" // column 3
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "zip2 v21.8h, v28.8h, v23.8h\n" // same three stages on the upper halves give columns 4..7
+ "zip1 v18.8h, v24.8h, v21.8h\n"
+ "zip2 v20.8h, v29.8h, v25.8h\n"
+ "zip2 v19.8h, v27.8h, v22.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x40]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x50]\n"
+ "zip2 v18.8h, v24.8h, v21.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x60]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x70]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x80\n" // 8 columns x 8 rows x 2 bytes written
+ "bge 2b\n"
+ "3:" // Main loop skip
+ "cbz %x[width], 8f\n" // no leftover columns
+ "tbz %x[width], #2, 5f\n" // tail: bits 2/1/0 of the remaining width select 4/2/1-element partial loads
+ "ldr d30, [x27], #0x8\n" // 4 leftover halfwords per row
+ "ldr d29, [x26], #0x8\n"
+ "ldr d28, [x25], #0x8\n"
+ "ldr d27, [x24], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
+ "tbz %x[width], #1, 4f\n"
+ "ld1 { v30.s }[2], [x27], #0x4\n" // 2 more halfwords per row into lanes 4..5
+ "ld1 { v29.s }[2], [x26], #0x4\n"
+ "ld1 { v28.s }[2], [x25], #0x4\n"
+ "ld1 { v27.s }[2], [x24], #0x4\n"
+ "ld1 { v24.s }[2], [x23], #0x4\n"
+ "ld1 { v25.s }[2], [x22], #0x4\n"
+ "ld1 { v23.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
+ "mov x19, #0x6\n" // x19 = number of leftover columns to emit
+ "tbz %x[width], #0, 7f\n"
+ "ld1 { v30.h }[6], [x27]\n" // 1 more halfword per row into lane 6
+ "ld1 { v29.h }[6], [x26]\n"
+ "ld1 { v28.h }[6], [x25]\n"
+ "ld1 { v27.h }[6], [x24]\n"
+ "ld1 { v24.h }[6], [x23]\n"
+ "ld1 { v25.h }[6], [x22]\n"
+ "ld1 { v23.h }[6], [x21]\n"
+ "ld1 { v22.h }[6], [x20]\n"
+ "mov x19, #0x7\n"
+ "b 7f\n"
+ "4:" // odd_loads_1_4
+ "mov x19, #0x4\n"
+ "tbz %x[width], #0, 7f\n"
+ "ld1 { v30.h }[4], [x27]\n" // fifth halfword per row into lane 4
+ "ld1 { v29.h }[4], [x26]\n"
+ "ld1 { v28.h }[4], [x25]\n"
+ "ld1 { v27.h }[4], [x24]\n"
+ "ld1 { v24.h }[4], [x23]\n"
+ "ld1 { v25.h }[4], [x22]\n"
+ "ld1 { v23.h }[4], [x21]\n"
+ "ld1 { v22.h }[4], [x20]\n"
+ "mov x19, #0x5\n"
+ "b 7f\n"
+ "5:" // odd_loads_2_0
+ "tbz %x[width], #1, 6f\n"
+ "ldr s30, [x27], #0x4\n" // 2 leftover halfwords per row
+ "ldr s29, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
+ "ldr s27, [x24], #0x4\n"
+ "ldr s24, [x23], #0x4\n"
+ "ldr s25, [x22], #0x4\n"
+ "ldr s23, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
+ "mov x19, #0x2\n"
+ "tbz %x[width], #0, 7f\n"
+ "ld1 { v30.h }[2], [x27]\n" // third halfword per row into lane 2
+ "ld1 { v29.h }[2], [x26]\n"
+ "ld1 { v28.h }[2], [x25]\n"
+ "ld1 { v27.h }[2], [x24]\n"
+ "ld1 { v24.h }[2], [x23]\n"
+ "ld1 { v25.h }[2], [x22]\n"
+ "ld1 { v23.h }[2], [x21]\n"
+ "ld1 { v22.h }[2], [x20]\n"
+ "mov x19, #0x3\n"
+ "b 7f\n"
+ "6:" // odd_loads_1_0
+ "ldr h30, [x27, #0x0]\n" // exactly one leftover halfword per row
+ "ldr h29, [x26, #0x0]\n"
+ "ldr h28, [x25, #0x0]\n"
+ "ldr h27, [x24, #0x0]\n"
+ "ldr h24, [x23, #0x0]\n"
+ "ldr h25, [x22, #0x0]\n"
+ "ldr h23, [x21, #0x0]\n"
+ "ldr h22, [x20, #0x0]\n"
+ "mov x19, #0x1\n"
+ "7:" // Odd load end
+ "zip1 v26.8h, v30.8h, v24.8h\n" // same zip network as the main loop, but one 16-byte column is stored per step
+ "subs x19, x19, #0x1\n" // flags survive until the next "beq 8f" (zip/str/add do not set flags)
+ "zip1 v18.8h, v28.8h, v23.8h\n"
+ "zip1 v20.8h, v26.8h, v18.8h\n"
+ "zip1 v21.8h, v29.8h, v25.8h\n"
+ "zip1 v19.8h, v27.8h, v22.8h\n"
+ "zip1 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v16.8h, v20.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover column 0
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 8f\n" // stop once x19 leftover columns have been written
+ "zip2 v16.8h, v20.8h, v17.8h\n"
+ "subs x19, x19, #0x1\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover column 1
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 8f\n"
+ "zip2 v18.8h, v26.8h, v18.8h\n"
+ "zip2 v17.8h, v21.8h, v19.8h\n"
+ "subs x19, x19, #0x1\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover column 2
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 8f\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "subs x19, x19, #0x1\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover column 3
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 8f\n"
+ "zip2 v24.8h, v30.8h, v24.8h\n"
+ "zip2 v21.8h, v28.8h, v23.8h\n"
+ "subs x19, x19, #0x1\n"
+ "zip1 v18.8h, v24.8h, v21.8h\n"
+ "zip2 v20.8h, v29.8h, v25.8h\n"
+ "zip2 v19.8h, v27.8h, v22.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover column 4
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 8f\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "subs x19, x19, #0x1\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover column 5
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 8f\n"
+ "zip2 v18.8h, v24.8h, v21.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover column 6 (at most 7 leftovers, so no further check)
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "8:" // Odds skip
+
+ : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // both are modified by the asm; out_ptr is a reference, so the caller sees the advance
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) // read-only inputs
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ );
+}
+
+template<>
+void interleave_block<8, 1, VLType::None, false>(
+ uint16_t * &out_ptr, const uint16_t * const * in, size_t width, size_t height,
+ size_t row_offset, bool
+)
+{
+ // The int16_t kernel only loads, zips and stores 16-bit lanes, so the unsigned case forwards to it.
+ const auto signed_rows = reinterpret_cast<const int16_t * const *>(in);
+ auto &signed_out = reinterpret_cast<int16_t * &>(out_ptr); // aliases out_ptr, so the advance reaches the caller
+ interleave_block<8, 1, VLType::None, false>(signed_out, signed_rows, width, height, row_offset, false);
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp
new file mode 100644
index 0000000000..35c7719de7
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s16_s16_summing.hpp
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<> // int16_t -> int16_t interleave of 8 rows (block 1) that also appends eight 32-bit per-row sums after the data.
+void interleave_block<8, 1, VLType::None, true>( // Each 16-byte output vector holds one input column across rows 0..7.
+ int16_t * &out_ptr, const int16_t * const * in, size_t width, size_t height, // out_ptr: output cursor (advanced in place); in: array from which 8 row pointers are read; width: columns to process; height: number of valid rows
+ size_t row_offset, bool first // row_offset: element offset added to every row pointer; first: when false, the sums stored in the 32 bytes before out_ptr are reloaded and continued
+)
+{
+ __asm__ __volatile__(
+ "movi v1.8h, #0x0\n" // v1: 16-bit partial sums, lane i belongs to row i
+ "ldr x27, [%x[in], #0x0]\n" // x27..x20 receive the pointers for rows 0..7 (all eight slots of 'in' are read)
+ "mov x19, #0x0\n" // x19: main-loop iterations since v1 was last flushed
+ "movi v0.4s, #0x0\n" // v0: 32-bit sums for rows 0-3
+ "ldr x26, [%x[in], #0x8]\n"
+ "cmp %x[height], #0x8\n" // flags are consumed by "beq 1f" below; nothing in between modifies them
+ "movi v31.4s, #0x0\n" // v31: 32-bit sums for rows 4-7
+ "ldr x25, [%x[in], #0x10]\n"
+ "add x27, x27, %x[row_offset], LSL #1\n" // row_offset counts 2-byte elements, hence LSL #1
+ "ldr x24, [%x[in], #0x18]\n"
+ "ldr x23, [%x[in], #0x20]\n"
+ "add x26, x26, %x[row_offset], LSL #1\n"
+ "ldr x22, [%x[in], #0x28]\n"
+ "add x25, x25, %x[row_offset], LSL #1\n"
+ "ldr x21, [%x[in], #0x30]\n"
+ "add x24, x24, %x[row_offset], LSL #1\n"
+ "ldr x20, [%x[in], #0x38]\n"
+ "add x23, x23, %x[row_offset], LSL #1\n"
+ "add x22, x22, %x[row_offset], LSL #1\n"
+ "add x21, x21, %x[row_offset], LSL #1\n"
+ "add x20, x20, %x[row_offset], LSL #1\n"
+ "beq 1f\n" // height == 8: use every row pointer as loaded
+ "mov x20, x27\n" // height != 8: row 7 re-reads row 0
+ "cmp %x[height], #0x2\n"
+ "csel x26, x26, x27, GE\n" // row 1 kept only if height >= 2, otherwise it aliases row 0
+ "csel x25, x25, x27, GT\n" // row 2 kept only if height > 2
+ "cmp %x[height], #0x4\n"
+ "csel x24, x24, x27, GE\n" // row 3 kept only if height >= 4
+ "csel x23, x23, x27, GT\n" // row 4 kept only if height > 4
+ "cmp %x[height], #0x6\n"
+ "csel x22, x22, x27, GE\n" // row 5 kept only if height >= 6
+ "csel x21, x21, x27, GT\n" // row 6 kept only if height > 6
+ "1:" // no_pointer_adj: row pointers are final from here on
+ "prfm pldl1keep, [x27, #0x0]\n" // prefetch offsets 0x0 and 0x40 of every row
+ "prfm pldl1keep, [x26, #0x0]\n"
+ "prfm pldl1keep, [x25, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x0]\n"
+ "prfm pldl1keep, [x23, #0x0]\n"
+ "prfm pldl1keep, [x22, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x0]\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x40]\n"
+ "prfm pldl1keep, [x26, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x40]\n"
+ "prfm pldl1keep, [x22, #0x40]\n"
+ "prfm pldl1keep, [x21, #0x40]\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "cbnz %w[first], 2f\n" // first != 0: start from the zeroed sums
+ "sub %x[out_ptr], %x[out_ptr], #0x20\n" // otherwise step back over the 32 bytes preceding out_ptr...
+ "ld1 { v0.4s }, [%x[out_ptr]]\n" // ...and reload them as the running sums for rows 0-3
+ "ldr q31, [%x[out_ptr], #0x10]\n" // ...and rows 4-7; new data is then written over this location
+ "2:" // first_pass
+ "cmp %x[width], #0x8\n"
+ "blt 5f\n" // fewer than 8 columns: go straight to the tail handling
+ "3:" // Main loop head
+ "cmp x19, #0xe\n" // NOTE(review): up to 15*8+7 int16 values are added per 16-bit lane of v1 before widening, which can wrap for large inputs - confirm the expected input range
+ "ble 4f\n" // flush v1 only once 15 iterations have accumulated
+ "saddw v0.4s, v0.4s, v1.4h\n" // widen lanes 0-3 of v1 into v0
+ "saddw2 v31.4s, v31.4s, v1.8h\n" // widen lanes 4-7 of v1 into v31
+ "mov x19, #0x0\n"
+ "movi v1.8h, #0x0\n"
+ "4:" // no_accumulate_16
+ "ldr q30, [x27], #0x10\n" // 8 columns of row 0, pointer post-incremented by 16 bytes
+ "prfm pldl1keep, [x27, #0x70]\n" // prefetch 0x70 bytes beyond the advanced pointer
+ "ldr q29, [x26], #0x10\n" // row 1
+ "ldr q28, [x25], #0x10\n" // row 2
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "ldr q27, [x24], #0x10\n" // row 3
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "ldr q24, [x23], #0x10\n" // row 4
+ "zip1 v26.8h, v30.8h, v24.8h\n"
+ "prfm pldl1keep, [x24, #0x70]\n"
+ "ldr q25, [x22], #0x10\n" // row 5
+ "zip2 v24.8h, v30.8h, v24.8h\n"
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "ldr q23, [x21], #0x10\n" // row 6
+ "zip1 v21.8h, v29.8h, v25.8h\n"
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "ldr q22, [x20], #0x10\n" // row 7
+ "zip1 v18.8h, v28.8h, v23.8h\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "add x19, x19, #0x1\n"
+ "zip1 v20.8h, v26.8h, v18.8h\n"
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "zip1 v19.8h, v27.8h, v22.8h\n"
+ "subs %x[width], %x[width], #0x8\n" // 8 columns consumed
+ "zip1 v17.8h, v21.8h, v19.8h\n"
+ "cmp %x[width], #0x8\n" // flags for the "bge 3b" at the bottom of the loop
+ "zip2 v18.8h, v26.8h, v18.8h\n"
+ "zip1 v16.8h, v20.8h, v17.8h\n" // v16 = column 0 across rows 0..7
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add v1.8h, v1.8h, v16.8h\n" // every stored column is also added to the per-row partial sums
+ "zip2 v17.8h, v20.8h, v17.8h\n" // column 1
+ "str q17, [%x[out_ptr], #0x10]\n"
+ "zip2 v16.8h, v21.8h, v19.8h\n"
+ "add v1.8h, v1.8h, v17.8h\n"
+ "zip1 v17.8h, v18.8h, v16.8h\n" // column 2
+ "str q17, [%x[out_ptr], #0x20]\n"
+ "zip2 v16.8h, v18.8h, v16.8h\n" // column 3
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "add v1.8h, v1.8h, v17.8h\n"
+ "zip2 v21.8h, v28.8h, v23.8h\n"
+ "zip1 v18.8h, v24.8h, v21.8h\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "zip2 v20.8h, v29.8h, v25.8h\n"
+ "zip2 v19.8h, v27.8h, v22.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n" // column 4
+ "str q16, [%x[out_ptr], #0x40]\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n" // column 5
+ "str q16, [%x[out_ptr], #0x50]\n"
+ "zip2 v18.8h, v24.8h, v21.8h\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n" // column 6
+ "str q16, [%x[out_ptr], #0x60]\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n" // column 7
+ "str q16, [%x[out_ptr], #0x70]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x80\n" // 8 columns x 8 rows x 2 bytes were written
+ "add v1.8h, v1.8h, v16.8h\n"
+ "bge 3b\n" // at least 8 columns remain
+ "5:" // Main loop skip
+ "cbz %x[width], 10f\n" // no leftover columns: only the sums remain to be written
+ "tbz %x[width], #2, 7f\n" // tail: the remaining width (1..7) is decoded bit by bit
+ "ldr d30, [x27], #0x8\n" // bit 2 set: 4 columns from each row
+ "ldr d29, [x26], #0x8\n"
+ "ldr d28, [x25], #0x8\n"
+ "ldr d27, [x24], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
+ "tbz %x[width], #1, 6f\n"
+ "ld1 { v30.s }[2], [x27], #0x4\n" // bit 1 set: 2 more columns
+ "ld1 { v29.s }[2], [x26], #0x4\n"
+ "ld1 { v28.s }[2], [x25], #0x4\n"
+ "ld1 { v27.s }[2], [x24], #0x4\n"
+ "ld1 { v24.s }[2], [x23], #0x4\n"
+ "ld1 { v25.s }[2], [x22], #0x4\n"
+ "ld1 { v23.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
+ "mov x19, #0x6\n" // x19 is reused as the number of tail columns to emit
+ "tbz %x[width], #0, 9f\n"
+ "ld1 { v30.h }[6], [x27]\n" // bit 0 set: 1 more column
+ "ld1 { v29.h }[6], [x26]\n"
+ "ld1 { v28.h }[6], [x25]\n"
+ "ld1 { v27.h }[6], [x24]\n"
+ "ld1 { v24.h }[6], [x23]\n"
+ "ld1 { v25.h }[6], [x22]\n"
+ "ld1 { v23.h }[6], [x21]\n"
+ "ld1 { v22.h }[6], [x20]\n"
+ "mov x19, #0x7\n"
+ "b 9f\n"
+ "6:" // odd_loads_1_4: 4 columns loaded, optionally one more
+ "mov x19, #0x4\n"
+ "tbz %x[width], #0, 9f\n"
+ "ld1 { v30.h }[4], [x27]\n"
+ "ld1 { v29.h }[4], [x26]\n"
+ "ld1 { v28.h }[4], [x25]\n"
+ "ld1 { v27.h }[4], [x24]\n"
+ "ld1 { v24.h }[4], [x23]\n"
+ "ld1 { v25.h }[4], [x22]\n"
+ "ld1 { v23.h }[4], [x21]\n"
+ "ld1 { v22.h }[4], [x20]\n"
+ "mov x19, #0x5\n"
+ "b 9f\n"
+ "7:" // odd_loads_2_0: fewer than 4 columns remain
+ "tbz %x[width], #1, 8f\n"
+ "ldr s30, [x27], #0x4\n" // 2 columns from each row
+ "ldr s29, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
+ "ldr s27, [x24], #0x4\n"
+ "ldr s24, [x23], #0x4\n"
+ "ldr s25, [x22], #0x4\n"
+ "ldr s23, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
+ "mov x19, #0x2\n"
+ "tbz %x[width], #0, 9f\n"
+ "ld1 { v30.h }[2], [x27]\n"
+ "ld1 { v29.h }[2], [x26]\n"
+ "ld1 { v28.h }[2], [x25]\n"
+ "ld1 { v27.h }[2], [x24]\n"
+ "ld1 { v24.h }[2], [x23]\n"
+ "ld1 { v25.h }[2], [x22]\n"
+ "ld1 { v23.h }[2], [x21]\n"
+ "ld1 { v22.h }[2], [x20]\n"
+ "mov x19, #0x3\n"
+ "b 9f\n"
+ "8:" // odd_loads_1_0: exactly one column remains
+ "ldr h30, [x27, #0x0]\n"
+ "ldr h29, [x26, #0x0]\n"
+ "ldr h28, [x25, #0x0]\n"
+ "ldr h27, [x24, #0x0]\n"
+ "ldr h24, [x23, #0x0]\n"
+ "ldr h25, [x22, #0x0]\n"
+ "ldr h23, [x21, #0x0]\n"
+ "ldr h22, [x20, #0x0]\n"
+ "mov x19, #0x1\n"
+ "9:" // Odd load end: emit x19 columns, one 16-byte vector each
+ "zip1 v26.8h, v30.8h, v24.8h\n"
+ "subs x19, x19, #0x1\n" // flags feed the next "beq 10f"; the vector ops and stores in between leave them untouched
+ "zip1 v18.8h, v28.8h, v23.8h\n"
+ "zip1 v20.8h, v26.8h, v18.8h\n"
+ "zip1 v21.8h, v29.8h, v25.8h\n"
+ "zip1 v19.8h, v27.8h, v22.8h\n"
+ "zip1 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v16.8h, v20.8h, v17.8h\n" // tail column 0
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 10f\n"
+ "zip2 v17.8h, v20.8h, v17.8h\n" // tail column 1
+ "subs x19, x19, #0x1\n"
+ "add v1.8h, v1.8h, v17.8h\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 10f\n"
+ "zip2 v18.8h, v26.8h, v18.8h\n"
+ "zip2 v16.8h, v21.8h, v19.8h\n"
+ "subs x19, x19, #0x1\n"
+ "zip1 v17.8h, v18.8h, v16.8h\n" // tail column 2
+ "str q17, [%x[out_ptr], #0x0]\n"
+ "add v1.8h, v1.8h, v17.8h\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 10f\n"
+ "zip2 v16.8h, v18.8h, v16.8h\n" // tail column 3
+ "subs x19, x19, #0x1\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 10f\n"
+ "zip2 v24.8h, v30.8h, v24.8h\n"
+ "zip2 v21.8h, v28.8h, v23.8h\n"
+ "subs x19, x19, #0x1\n"
+ "zip1 v18.8h, v24.8h, v21.8h\n"
+ "zip2 v20.8h, v29.8h, v25.8h\n"
+ "zip2 v19.8h, v27.8h, v22.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n" // tail column 4
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 10f\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n" // tail column 5
+ "subs x19, x19, #0x1\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 10f\n"
+ "zip2 v18.8h, v24.8h, v21.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n" // tail column 6 (a tail never holds more than 7 columns)
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "10:" // Odds skip: flush the partial sums and append the 32-bit row sums
+ "saddw v0.4s, v0.4s, v1.4h\n"
+ "str q0, [%x[out_ptr], #0x0]\n" // sums for rows 0-3
+ "saddw2 v31.4s, v31.4s, v1.8h\n"
+ "str q31, [%x[out_ptr], #0x10]\n" // sums for rows 4-7
+ "add %x[out_ptr], %x[out_ptr], #0x20\n" // out_ptr ends just past the 32 bytes of sums
+ : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // read-write: output cursor and remaining column count
+ : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) // read-only inputs
+ : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp
new file mode 100644
index 0000000000..582836fe67
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16.hpp
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<> // int8_t -> int16_t interleave of 8 rows (block 1), no row sums: input bytes are sign-extended to 16 bits.
+void interleave_block<8, 1, VLType::None, false>( // Each 16-byte output vector holds one input column across rows 0..7.
+ int16_t * &out_ptr, const int8_t * const * in, size_t width, size_t height, // out_ptr: output cursor (advanced in place); in: array from which 8 row pointers are read; width: columns to process; height: number of valid rows
+ size_t row_offset, bool // row_offset: byte offset added to every row pointer; the trailing bool is unnamed and unused here
+)
+{
+ __asm__ __volatile__(
+ "ldr x27, [%x[in], #0x0]\n" // x27..x20 receive the pointers for rows 0..7 (all eight slots of 'in' are read)
+ "cmp %x[height], #0x8\n" // flags are consumed by "beq 1f" below; nothing in between modifies them
+ "ldr x26, [%x[in], #0x8]\n"
+ "add x27, x27, %x[row_offset]\n" // elements are single bytes, so row_offset is added unscaled
+ "ldr x25, [%x[in], #0x10]\n"
+ "ldr x24, [%x[in], #0x18]\n"
+ "add x26, x26, %x[row_offset]\n"
+ "ldr x23, [%x[in], #0x20]\n"
+ "add x25, x25, %x[row_offset]\n"
+ "ldr x22, [%x[in], #0x28]\n"
+ "ldr x21, [%x[in], #0x30]\n"
+ "add x24, x24, %x[row_offset]\n"
+ "ldr x20, [%x[in], #0x38]\n"
+ "add x23, x23, %x[row_offset]\n"
+ "add x22, x22, %x[row_offset]\n"
+ "add x21, x21, %x[row_offset]\n"
+ "add x20, x20, %x[row_offset]\n"
+ "beq 1f\n" // height == 8: use every row pointer as loaded
+ "mov x20, x27\n" // height != 8: row 7 re-reads row 0
+ "cmp %x[height], #0x2\n"
+ "csel x26, x26, x27, GE\n" // row 1 kept only if height >= 2, otherwise it aliases row 0
+ "csel x25, x25, x27, GT\n" // row 2 kept only if height > 2
+ "cmp %x[height], #0x4\n"
+ "csel x24, x24, x27, GE\n" // row 3 kept only if height >= 4
+ "csel x23, x23, x27, GT\n" // row 4 kept only if height > 4
+ "cmp %x[height], #0x6\n"
+ "csel x22, x22, x27, GE\n" // row 5 kept only if height >= 6
+ "csel x21, x21, x27, GT\n" // row 6 kept only if height > 6
+ "1:" // no_pointer_adj: row pointers are final from here on
+ "prfm pldl1keep, [x27, #0x0]\n" // prefetch offsets 0x0 and 0x40 of every row
+ "cmp %x[width], #0x8\n" // flags are consumed by "blt 3f" after the prefetches
+ "prfm pldl1keep, [x26, #0x0]\n"
+ "prfm pldl1keep, [x25, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x0]\n"
+ "prfm pldl1keep, [x23, #0x0]\n"
+ "prfm pldl1keep, [x22, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x0]\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x40]\n"
+ "prfm pldl1keep, [x26, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x40]\n"
+ "prfm pldl1keep, [x22, #0x40]\n"
+ "prfm pldl1keep, [x21, #0x40]\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "blt 3f\n" // fewer than 8 columns: go straight to the tail handling
+ "2:" // Main loop head: 8 columns per iteration
+ "ldr d30, [x27], #0x8\n" // 8 bytes (8 columns) of row 0, pointer post-incremented
+ "prfm pldl1keep, [x27, #0x70]\n" // prefetch 0x70 bytes beyond the advanced pointer
+ "ldr d29, [x26], #0x8\n" // row 1
+ "ldr d28, [x25], #0x8\n" // row 2
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "ldr d27, [x24], #0x8\n" // row 3
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "ldr d23, [x23], #0x8\n" // row 4
+ "ldr d21, [x22], #0x8\n" // row 5
+ "prfm pldl1keep, [x24, #0x70]\n"
+ "ldr d26, [x21], #0x8\n" // row 6
+ "ldr d25, [x20], #0x8\n" // row 7
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "sshll v30.8h, v30.8b, #0x0\n" // shift by 0: pure sign extension from 8 to 16 bits
+ "sshll v29.8h, v29.8b, #0x0\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "sshll v28.8h, v28.8b, #0x0\n"
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "sshll v27.8h, v27.8b, #0x0\n"
+ "sshll v23.8h, v23.8b, #0x0\n"
+ "zip1 v24.8h, v30.8h, v23.8h\n"
+ "sshll v21.8h, v21.8b, #0x0\n"
+ "zip2 v23.8h, v30.8h, v23.8h\n"
+ "sshll v26.8h, v26.8b, #0x0\n"
+ "sshll v25.8h, v25.8b, #0x0\n"
+ "zip1 v22.8h, v29.8h, v21.8h\n"
+ "subs %x[width], %x[width], #0x8\n" // 8 columns consumed
+ "zip2 v21.8h, v29.8h, v21.8h\n"
+ "cmp %x[width], #0x8\n" // flags for the "bge 2b" at the bottom of the loop
+ "zip1 v20.8h, v28.8h, v26.8h\n"
+ "zip1 v18.8h, v24.8h, v20.8h\n"
+ "zip1 v19.8h, v27.8h, v25.8h\n"
+ "zip1 v17.8h, v22.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n" // v16 = column 0 across rows 0..7
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n" // column 1
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "zip2 v18.8h, v24.8h, v20.8h\n"
+ "zip2 v17.8h, v22.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n" // column 2
+ "str q16, [%x[out_ptr], #0x20]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n" // column 3
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "zip2 v20.8h, v28.8h, v26.8h\n"
+ "zip1 v18.8h, v23.8h, v20.8h\n"
+ "zip2 v19.8h, v27.8h, v25.8h\n"
+ "zip1 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n" // column 4
+ "str q16, [%x[out_ptr], #0x40]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n" // column 5
+ "str q16, [%x[out_ptr], #0x50]\n"
+ "zip2 v18.8h, v23.8h, v20.8h\n"
+ "zip2 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n" // column 6
+ "str q16, [%x[out_ptr], #0x60]\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n" // column 7
+ "str q16, [%x[out_ptr], #0x70]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x80\n" // 8 columns x 8 rows x 2 bytes were written
+ "bge 2b\n" // at least 8 columns remain
+ "3:" // Main loop skip
+ "cbz %x[width], 8f\n" // no leftover columns: done
+ "tbz %x[width], #2, 5f\n" // tail: the remaining width (1..7) is decoded bit by bit
+ "ldr s30, [x27], #0x4\n" // bit 2 set: 4 columns (4 bytes) from each row
+ "ldr s29, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
+ "ldr s27, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s26, [x21], #0x4\n"
+ "ldr s25, [x20], #0x4\n"
+ "tbz %x[width], #1, 4f\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n" // bit 1 set: 2 more columns
+ "ld1 { v29.h }[2], [x26], #0x2\n"
+ "ld1 { v28.h }[2], [x25], #0x2\n"
+ "ld1 { v27.h }[2], [x24], #0x2\n"
+ "ld1 { v23.h }[2], [x23], #0x2\n"
+ "ld1 { v21.h }[2], [x22], #0x2\n"
+ "ld1 { v26.h }[2], [x21], #0x2\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "mov x19, #0x6\n" // x19: number of tail columns to emit
+ "tbz %x[width], #0, 7f\n"
+ "ld1 { v30.b }[6], [x27]\n" // bit 0 set: 1 more column
+ "ld1 { v29.b }[6], [x26]\n"
+ "ld1 { v28.b }[6], [x25]\n"
+ "ld1 { v27.b }[6], [x24]\n"
+ "ld1 { v23.b }[6], [x23]\n"
+ "ld1 { v21.b }[6], [x22]\n"
+ "ld1 { v26.b }[6], [x21]\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "mov x19, #0x7\n"
+ "b 7f\n"
+ "4:" // odd_loads_1_4: 4 columns loaded, optionally one more
+ "mov x19, #0x4\n"
+ "tbz %x[width], #0, 7f\n"
+ "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v29.b }[4], [x26]\n"
+ "ld1 { v28.b }[4], [x25]\n"
+ "ld1 { v27.b }[4], [x24]\n"
+ "ld1 { v23.b }[4], [x23]\n"
+ "ld1 { v21.b }[4], [x22]\n"
+ "ld1 { v26.b }[4], [x21]\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "mov x19, #0x5\n"
+ "b 7f\n"
+ "5:" // odd_loads_2_0: fewer than 4 columns remain
+ "tbz %x[width], #1, 6f\n"
+ "ldr h30, [x27], #0x2\n" // 2 columns (2 bytes) from each row
+ "ldr h29, [x26], #0x2\n"
+ "ldr h28, [x25], #0x2\n"
+ "ldr h27, [x24], #0x2\n"
+ "ldr h23, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h26, [x21], #0x2\n"
+ "ldr h25, [x20], #0x2\n"
+ "mov x19, #0x2\n"
+ "tbz %x[width], #0, 7f\n"
+ "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v29.b }[2], [x26]\n"
+ "ld1 { v28.b }[2], [x25]\n"
+ "ld1 { v27.b }[2], [x24]\n"
+ "ld1 { v23.b }[2], [x23]\n"
+ "ld1 { v21.b }[2], [x22]\n"
+ "ld1 { v26.b }[2], [x21]\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "mov x19, #0x3\n"
+ "b 7f\n"
+ "6:" // odd_loads_1_0: exactly one column remains
+ "ldr b30, [x27, #0x0]\n"
+ "ldr b29, [x26, #0x0]\n"
+ "ldr b28, [x25, #0x0]\n"
+ "ldr b27, [x24, #0x0]\n"
+ "ldr b23, [x23, #0x0]\n"
+ "ldr b21, [x22, #0x0]\n"
+ "ldr b26, [x21, #0x0]\n"
+ "ldr b25, [x20, #0x0]\n"
+ "mov x19, #0x1\n"
+ "7:" // Odd load end: widen, then emit x19 columns, one 16-byte vector each
+ "sshll v30.8h, v30.8b, #0x0\n"
+ "sshll v29.8h, v29.8b, #0x0\n"
+ "sshll v28.8h, v28.8b, #0x0\n"
+ "sshll v27.8h, v27.8b, #0x0\n"
+ "sshll v23.8h, v23.8b, #0x0\n"
+ "zip1 v24.8h, v30.8h, v23.8h\n"
+ "sshll v21.8h, v21.8b, #0x0\n"
+ "sshll v26.8h, v26.8b, #0x0\n"
+ "zip1 v20.8h, v28.8h, v26.8h\n"
+ "sshll v25.8h, v25.8b, #0x0\n"
+ "zip1 v22.8h, v29.8h, v21.8h\n"
+ "subs x19, x19, #0x1\n" // flags feed the next "beq 8f"; the vector ops and stores in between leave them untouched
+ "zip1 v18.8h, v24.8h, v20.8h\n"
+ "zip1 v19.8h, v27.8h, v25.8h\n"
+ "zip1 v17.8h, v22.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n" // tail column 0
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 8f\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n" // tail column 1
+ "subs x19, x19, #0x1\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 8f\n"
+ "zip2 v18.8h, v24.8h, v20.8h\n"
+ "zip2 v17.8h, v22.8h, v19.8h\n"
+ "subs x19, x19, #0x1\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n" // tail column 2
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 8f\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n" // tail column 3
+ "subs x19, x19, #0x1\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 8f\n"
+ "zip2 v23.8h, v30.8h, v23.8h\n"
+ "zip2 v20.8h, v28.8h, v26.8h\n"
+ "subs x19, x19, #0x1\n"
+ "zip1 v18.8h, v23.8h, v20.8h\n"
+ "zip2 v21.8h, v29.8h, v21.8h\n"
+ "zip2 v19.8h, v27.8h, v25.8h\n"
+ "zip1 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n" // tail column 4
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 8f\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n" // tail column 5
+ "subs x19, x19, #0x1\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 8f\n"
+ "zip2 v18.8h, v23.8h, v20.8h\n"
+ "zip2 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n" // tail column 6 (a tail never holds more than 7 columns)
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "8:" // Odds skip: out_ptr now points just past the last column written
+
+ : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // read-write: output cursor and remaining column count
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) // read-only inputs
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp
new file mode 100644
index 0000000000..35dc3dc0d4
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_s8_s16_summing.hpp
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<> // int8_t -> int16_t interleave of 8 rows (block 1) that also appends eight 32-bit per-row sums after the data.
+void interleave_block<8, 1, VLType::None, true>( // Input bytes are sign-extended; each 16-byte output vector holds one input column across rows 0..7.
+ int16_t * &out_ptr, const int8_t * const * in, size_t width, size_t height, // out_ptr: output cursor (advanced in place); in: array from which 8 row pointers are read; width: columns to process; height: number of valid rows
+ size_t row_offset, bool first // row_offset: byte offset added to every row pointer; first: when false, the sums stored in the 32 bytes before out_ptr are reloaded and continued
+)
+{
+ __asm__ __volatile__(
+ "movi v1.8h, #0x0\n" // v1: 16-bit partial sums, lane i belongs to row i
+ "ldr x27, [%x[in], #0x0]\n" // x27..x20 receive the pointers for rows 0..7 (all eight slots of 'in' are read)
+ "mov x19, #0x0\n" // x19: main-loop iterations since v1 was last flushed
+ "movi v0.4s, #0x0\n" // v0: 32-bit sums for rows 0-3
+ "ldr x26, [%x[in], #0x8]\n"
+ "cmp %x[height], #0x8\n" // flags are consumed by "beq 1f" below; nothing in between modifies them
+ "movi v31.4s, #0x0\n" // v31: 32-bit sums for rows 4-7
+ "ldr x25, [%x[in], #0x10]\n"
+ "add x27, x27, %x[row_offset]\n" // elements are single bytes, so row_offset is added unscaled
+ "ldr x24, [%x[in], #0x18]\n"
+ "ldr x23, [%x[in], #0x20]\n"
+ "add x26, x26, %x[row_offset]\n"
+ "ldr x22, [%x[in], #0x28]\n"
+ "add x25, x25, %x[row_offset]\n"
+ "ldr x21, [%x[in], #0x30]\n"
+ "add x24, x24, %x[row_offset]\n"
+ "ldr x20, [%x[in], #0x38]\n"
+ "add x23, x23, %x[row_offset]\n"
+ "add x22, x22, %x[row_offset]\n"
+ "add x21, x21, %x[row_offset]\n"
+ "add x20, x20, %x[row_offset]\n"
+ "beq 1f\n" // height == 8: use every row pointer as loaded
+ "mov x20, x27\n" // height != 8: row 7 re-reads row 0
+ "cmp %x[height], #0x2\n"
+ "csel x26, x26, x27, GE\n" // row 1 kept only if height >= 2, otherwise it aliases row 0
+ "csel x25, x25, x27, GT\n" // row 2 kept only if height > 2
+ "cmp %x[height], #0x4\n"
+ "csel x24, x24, x27, GE\n" // row 3 kept only if height >= 4
+ "csel x23, x23, x27, GT\n" // row 4 kept only if height > 4
+ "cmp %x[height], #0x6\n"
+ "csel x22, x22, x27, GE\n" // row 5 kept only if height >= 6
+ "csel x21, x21, x27, GT\n" // row 6 kept only if height > 6
+ "1:" // no_pointer_adj: row pointers are final from here on
+ "prfm pldl1keep, [x27, #0x0]\n" // prefetch offsets 0x0 and 0x40 of every row
+ "prfm pldl1keep, [x26, #0x0]\n"
+ "prfm pldl1keep, [x25, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x0]\n"
+ "prfm pldl1keep, [x23, #0x0]\n"
+ "prfm pldl1keep, [x22, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x0]\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x40]\n"
+ "prfm pldl1keep, [x26, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x40]\n"
+ "prfm pldl1keep, [x22, #0x40]\n"
+ "prfm pldl1keep, [x21, #0x40]\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "cbnz %w[first], 2f\n" // first != 0: start from the zeroed sums
+ "sub %x[out_ptr], %x[out_ptr], #0x20\n" // otherwise step back over the 32 bytes preceding out_ptr...
+ "ld1 { v0.4s }, [%x[out_ptr]]\n" // ...and reload them as the running sums for rows 0-3
+ "ldr q31, [%x[out_ptr], #0x10]\n" // ...and rows 4-7; new data is then written over this location
+ "2:" // first_pass
+ "cmp %x[width], #0x8\n"
+ "blt 5f\n" // fewer than 8 columns: go straight to the tail handling
+ "3:" // Main loop head
+ "cmp x19, #0xe\n" // at most 15*8+7 = 127 sign-extended bytes reach a 16-bit lane of v1 between flushes (|sum| <= 127*128), so it cannot wrap
+ "ble 4f\n" // flush v1 only once 15 iterations have accumulated
+ "saddw v0.4s, v0.4s, v1.4h\n" // widen lanes 0-3 of v1 into v0
+ "saddw2 v31.4s, v31.4s, v1.8h\n" // widen lanes 4-7 of v1 into v31
+ "mov x19, #0x0\n"
+ "movi v1.8h, #0x0\n"
+ "4:" // no_accumulate_16
+ "ldr d30, [x27], #0x8\n" // 8 bytes (8 columns) of row 0, pointer post-incremented
+ "prfm pldl1keep, [x27, #0x70]\n" // prefetch 0x70 bytes beyond the advanced pointer
+ "ldr d29, [x26], #0x8\n" // row 1
+ "ldr d28, [x25], #0x8\n" // row 2
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "ldr d27, [x24], #0x8\n" // row 3
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "ldr d23, [x23], #0x8\n" // row 4
+ "ldr d21, [x22], #0x8\n" // row 5
+ "prfm pldl1keep, [x24, #0x70]\n"
+ "ldr d26, [x21], #0x8\n" // row 6
+ "ldr d25, [x20], #0x8\n" // row 7
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "sshll v30.8h, v30.8b, #0x0\n" // shift by 0: pure sign extension from 8 to 16 bits
+ "sshll v29.8h, v29.8b, #0x0\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "sshll v28.8h, v28.8b, #0x0\n"
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "sshll v27.8h, v27.8b, #0x0\n"
+ "sshll v23.8h, v23.8b, #0x0\n"
+ "zip1 v24.8h, v30.8h, v23.8h\n"
+ "sshll v21.8h, v21.8b, #0x0\n"
+ "zip2 v23.8h, v30.8h, v23.8h\n"
+ "sshll v26.8h, v26.8b, #0x0\n"
+ "sshll v25.8h, v25.8b, #0x0\n"
+ "zip1 v22.8h, v29.8h, v21.8h\n"
+ "add x19, x19, #0x1\n"
+ "zip2 v21.8h, v29.8h, v21.8h\n"
+ "subs %x[width], %x[width], #0x8\n" // 8 columns consumed
+ "zip1 v20.8h, v28.8h, v26.8h\n"
+ "cmp %x[width], #0x8\n" // flags for the "bge 3b" at the bottom of the loop
+ "zip1 v18.8h, v24.8h, v20.8h\n"
+ "zip1 v19.8h, v27.8h, v25.8h\n"
+ "zip1 v17.8h, v22.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n" // v16 = column 0 across rows 0..7
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add v1.8h, v1.8h, v16.8h\n" // every stored column is also added to the per-row partial sums
+ "zip2 v16.8h, v18.8h, v17.8h\n" // column 1
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "zip2 v18.8h, v24.8h, v20.8h\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "zip2 v17.8h, v22.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n" // column 2
+ "str q16, [%x[out_ptr], #0x20]\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n" // column 3
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "zip2 v20.8h, v28.8h, v26.8h\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "zip1 v18.8h, v23.8h, v20.8h\n"
+ "zip2 v19.8h, v27.8h, v25.8h\n"
+ "zip1 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n" // column 4
+ "str q16, [%x[out_ptr], #0x40]\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n" // column 5
+ "str q16, [%x[out_ptr], #0x50]\n"
+ "zip2 v18.8h, v23.8h, v20.8h\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "zip2 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n" // column 6
+ "str q16, [%x[out_ptr], #0x60]\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n" // column 7
+ "str q16, [%x[out_ptr], #0x70]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x80\n" // 8 columns x 8 rows x 2 bytes were written
+ "add v1.8h, v1.8h, v16.8h\n"
+ "bge 3b\n" // at least 8 columns remain
+ "5:" // Main loop skip
+ "cbz %x[width], 10f\n" // no leftover columns: only the sums remain to be written
+ "tbz %x[width], #2, 7f\n" // tail: the remaining width (1..7) is decoded bit by bit
+ "ldr s30, [x27], #0x4\n" // bit 2 set: 4 columns (4 bytes) from each row
+ "ldr s29, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
+ "ldr s27, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s26, [x21], #0x4\n"
+ "ldr s25, [x20], #0x4\n"
+ "tbz %x[width], #1, 6f\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n" // bit 1 set: 2 more columns
+ "ld1 { v29.h }[2], [x26], #0x2\n"
+ "ld1 { v28.h }[2], [x25], #0x2\n"
+ "ld1 { v27.h }[2], [x24], #0x2\n"
+ "ld1 { v23.h }[2], [x23], #0x2\n"
+ "ld1 { v21.h }[2], [x22], #0x2\n"
+ "ld1 { v26.h }[2], [x21], #0x2\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "mov x19, #0x6\n" // x19 is reused as the number of tail columns to emit
+ "tbz %x[width], #0, 9f\n"
+ "ld1 { v30.b }[6], [x27]\n" // bit 0 set: 1 more column
+ "ld1 { v29.b }[6], [x26]\n"
+ "ld1 { v28.b }[6], [x25]\n"
+ "ld1 { v27.b }[6], [x24]\n"
+ "ld1 { v23.b }[6], [x23]\n"
+ "ld1 { v21.b }[6], [x22]\n"
+ "ld1 { v26.b }[6], [x21]\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "mov x19, #0x7\n"
+ "b 9f\n"
+ "6:" // odd_loads_1_4: 4 columns loaded, optionally one more
+ "mov x19, #0x4\n"
+ "tbz %x[width], #0, 9f\n"
+ "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v29.b }[4], [x26]\n"
+ "ld1 { v28.b }[4], [x25]\n"
+ "ld1 { v27.b }[4], [x24]\n"
+ "ld1 { v23.b }[4], [x23]\n"
+ "ld1 { v21.b }[4], [x22]\n"
+ "ld1 { v26.b }[4], [x21]\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "mov x19, #0x5\n"
+ "b 9f\n"
+ "7:" // odd_loads_2_0: fewer than 4 columns remain
+ "tbz %x[width], #1, 8f\n"
+ "ldr h30, [x27], #0x2\n" // 2 columns (2 bytes) from each row
+ "ldr h29, [x26], #0x2\n"
+ "ldr h28, [x25], #0x2\n"
+ "ldr h27, [x24], #0x2\n"
+ "ldr h23, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h26, [x21], #0x2\n"
+ "ldr h25, [x20], #0x2\n"
+ "mov x19, #0x2\n"
+ "tbz %x[width], #0, 9f\n"
+ "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v29.b }[2], [x26]\n"
+ "ld1 { v28.b }[2], [x25]\n"
+ "ld1 { v27.b }[2], [x24]\n"
+ "ld1 { v23.b }[2], [x23]\n"
+ "ld1 { v21.b }[2], [x22]\n"
+ "ld1 { v26.b }[2], [x21]\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "mov x19, #0x3\n"
+ "b 9f\n"
+ "8:" // odd_loads_1_0: exactly one column remains
+ "ldr b30, [x27, #0x0]\n"
+ "ldr b29, [x26, #0x0]\n"
+ "ldr b28, [x25, #0x0]\n"
+ "ldr b27, [x24, #0x0]\n"
+ "ldr b23, [x23, #0x0]\n"
+ "ldr b21, [x22, #0x0]\n"
+ "ldr b26, [x21, #0x0]\n"
+ "ldr b25, [x20, #0x0]\n"
+ "mov x19, #0x1\n"
+ "9:" // Odd load end: widen, then emit x19 columns, one 16-byte vector each
+ "sshll v30.8h, v30.8b, #0x0\n"
+ "sshll v29.8h, v29.8b, #0x0\n"
+ "sshll v28.8h, v28.8b, #0x0\n"
+ "sshll v27.8h, v27.8b, #0x0\n"
+ "sshll v23.8h, v23.8b, #0x0\n"
+ "zip1 v24.8h, v30.8h, v23.8h\n"
+ "sshll v21.8h, v21.8b, #0x0\n"
+ "sshll v26.8h, v26.8b, #0x0\n"
+ "zip1 v20.8h, v28.8h, v26.8h\n"
+ "sshll v25.8h, v25.8b, #0x0\n"
+ "zip1 v22.8h, v29.8h, v21.8h\n"
+ "subs x19, x19, #0x1\n" // flags feed the next "beq 10f"; the vector ops and stores in between leave them untouched
+ "zip1 v18.8h, v24.8h, v20.8h\n"
+ "zip1 v19.8h, v27.8h, v25.8h\n"
+ "zip1 v17.8h, v22.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n" // tail column 0
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 10f\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n" // tail column 1
+ "subs x19, x19, #0x1\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 10f\n"
+ "zip2 v18.8h, v24.8h, v20.8h\n"
+ "zip2 v17.8h, v22.8h, v19.8h\n"
+ "subs x19, x19, #0x1\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n" // tail column 2
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 10f\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n" // tail column 3
+ "subs x19, x19, #0x1\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 10f\n"
+ "zip2 v23.8h, v30.8h, v23.8h\n"
+ "zip2 v20.8h, v28.8h, v26.8h\n"
+ "subs x19, x19, #0x1\n"
+ "zip1 v18.8h, v23.8h, v20.8h\n"
+ "zip2 v21.8h, v29.8h, v21.8h\n"
+ "zip2 v19.8h, v27.8h, v25.8h\n"
+ "zip1 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n" // tail column 4
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 10f\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n" // tail column 5
+ "subs x19, x19, #0x1\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 10f\n"
+ "zip2 v18.8h, v23.8h, v20.8h\n"
+ "zip2 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n" // tail column 6 (a tail never holds more than 7 columns)
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "10:" // Odds skip: flush the partial sums and append the 32-bit row sums
+ "saddw v0.4s, v0.4s, v1.4h\n"
+ "str q0, [%x[out_ptr], #0x0]\n" // sums for rows 0-3
+ "saddw2 v31.4s, v31.4s, v1.8h\n"
+ "str q31, [%x[out_ptr], #0x10]\n" // sums for rows 4-7
+ "add %x[out_ptr], %x[out_ptr], #0x20\n" // out_ptr ends just past the 32 bytes of sums
+ : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // read-write: output cursor and remaining column count
+ : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) // read-only inputs
+ : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp
new file mode 100644
index 0000000000..bfa8989a4d
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u16_u16_summing.hpp
@@ -0,0 +1,306 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<> // Specialisation <8, 1, VLType::None, true> for uint16_t input and output: transposes 8 rows into groups of 8 (one element per row) and appends per-row sums.
+void interleave_block<8, 1, VLType::None, true>(
+ uint16_t * &out_ptr, const uint16_t * const * in, size_t width, size_t height, // out_ptr: write cursor, updated to point past everything written; in: row pointers (entries 0-7 are always read); width: elements consumed per row; height: rows actually used
+ size_t row_offset, bool first // row_offset: element offset added to each row pointer; first: false means resume from the sums stored in the 32 bytes just before out_ptr
+)
+{
+ __asm__ __volatile__(
+ "movi v1.8h, #0x0\n" // v1: 16-bit running sums, lane i accumulates row i
+ "ldr x27, [%x[in], #0x0]\n" // x27, x26, ..., x20 take in[0], in[1], ..., in[7]
+ "mov x19, #0x0\n" // x19: main-loop iterations since v1 was last folded into v0/v31
+ "movi v0.4s, #0x0\n" // v0: 32-bit sums for rows 0-3
+ "ldr x26, [%x[in], #0x8]\n"
+ "cmp %x[height], #0x8\n" // tested by the beq below; none of the instructions in between write the flags
+ "movi v31.4s, #0x0\n" // v31: 32-bit sums for rows 4-7
+ "ldr x25, [%x[in], #0x10]\n"
+ "add x27, x27, %x[row_offset], LSL #1\n" // row_offset is scaled by 2 (bytes per 16-bit element)
+ "ldr x24, [%x[in], #0x18]\n"
+ "ldr x23, [%x[in], #0x20]\n"
+ "add x26, x26, %x[row_offset], LSL #1\n"
+ "ldr x22, [%x[in], #0x28]\n"
+ "add x25, x25, %x[row_offset], LSL #1\n"
+ "ldr x21, [%x[in], #0x30]\n"
+ "add x24, x24, %x[row_offset], LSL #1\n"
+ "ldr x20, [%x[in], #0x38]\n"
+ "add x23, x23, %x[row_offset], LSL #1\n"
+ "add x22, x22, %x[row_offset], LSL #1\n"
+ "add x21, x21, %x[row_offset], LSL #1\n"
+ "add x20, x20, %x[row_offset], LSL #1\n"
+ "beq 1f\n" // height == 8: keep all eight pointers as loaded
+ "mov x20, x27\n" // height != 8: row 7 is read through row 0's pointer instead
+ "cmp %x[height], #0x2\n"
+ "csel x26, x26, x27, GE\n" // row 1 kept if height >= 2, otherwise aliases row 0
+ "csel x25, x25, x27, GT\n" // row 2 kept if height > 2
+ "cmp %x[height], #0x4\n"
+ "csel x24, x24, x27, GE\n" // row 3 kept if height >= 4
+ "csel x23, x23, x27, GT\n" // row 4 kept if height > 4
+ "cmp %x[height], #0x6\n"
+ "csel x22, x22, x27, GE\n" // row 5 kept if height >= 6
+ "csel x21, x21, x27, GT\n" // row 6 kept if height > 6
+ "1:" // no_pointer_adj
+ "prfm pldl1keep, [x27, #0x0]\n" // prefetch offsets 0x0 and 0x40 of every row before the loop
+ "prfm pldl1keep, [x26, #0x0]\n"
+ "prfm pldl1keep, [x25, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x0]\n"
+ "prfm pldl1keep, [x23, #0x0]\n"
+ "prfm pldl1keep, [x22, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x0]\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x40]\n"
+ "prfm pldl1keep, [x26, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x40]\n"
+ "prfm pldl1keep, [x22, #0x40]\n"
+ "prfm pldl1keep, [x21, #0x40]\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "cbnz %w[first], 2f\n" // first pass: keep the zeroed sums
+ "sub %x[out_ptr], %x[out_ptr], #0x20\n" // otherwise step back over the 32 bytes of sums (presumably written by an earlier call); new data overwrites them and updated sums are re-appended at label 10
+ "ld1 { v0.4s }, [%x[out_ptr]]\n" // reload 32-bit sums for rows 0-3
+ "ldr q31, [%x[out_ptr], #0x10]\n" // reload 32-bit sums for rows 4-7
+ "2:" // first_pass
+ "cmp %x[width], #0x8\n"
+ "blt 5f\n" // fewer than 8 elements per row: go straight to the tail
+ "3:" // Main loop head: each iteration consumes 8 elements per row and writes 0x80 bytes
+ "cmp x19, #0xe\n"
+ "ble 4f\n" // fold only once 15 iterations have accumulated in v1
+ "uaddw v0.4s, v0.4s, v1.4h\n" // widen lanes 0-3 of v1 and add into v0
+ "uaddw2 v31.4s, v31.4s, v1.8h\n" // widen lanes 4-7 of v1 and add into v31
+ "mov x19, #0x0\n"
+ "movi v1.8h, #0x0\n"
+ "4:" // no_accumulate_16
+ "ldr q30, [x27], #0x10\n" // rows 0-7 land in v30, v29, v28, v27, v24, v25, v23, v22 (8 halfwords each, pointers post-incremented)
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "ldr q29, [x26], #0x10\n"
+ "ldr q28, [x25], #0x10\n"
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "ldr q27, [x24], #0x10\n"
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "ldr q24, [x23], #0x10\n"
+ "zip1 v26.8h, v30.8h, v24.8h\n" // three levels of zip1/zip2 transpose the 8x8 tile: each stored vector holds element j of rows 0-7
+ "prfm pldl1keep, [x24, #0x70]\n"
+ "ldr q25, [x22], #0x10\n"
+ "zip2 v24.8h, v30.8h, v24.8h\n"
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "ldr q23, [x21], #0x10\n"
+ "zip1 v21.8h, v29.8h, v25.8h\n"
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "ldr q22, [x20], #0x10\n"
+ "zip1 v18.8h, v28.8h, v23.8h\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "add x19, x19, #0x1\n"
+ "zip1 v20.8h, v26.8h, v18.8h\n"
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "zip1 v19.8h, v27.8h, v22.8h\n"
+ "subs %x[width], %x[width], #0x8\n"
+ "zip1 v17.8h, v21.8h, v19.8h\n"
+ "cmp %x[width], #0x8\n" // flags for the bge at the loop end; the zip/str/add instructions in between do not write flags
+ "zip2 v18.8h, v26.8h, v18.8h\n"
+ "zip1 v16.8h, v20.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // element 0 of rows 0-7
+ "add v1.8h, v1.8h, v16.8h\n" // 16-bit lane add (wraps modulo 2^16). NOTE(review): inputs here are 16-bit and up to 127 values reach a lane between folds - confirm callers only pass values small enough not to wrap
+ "zip2 v17.8h, v20.8h, v17.8h\n"
+ "str q17, [%x[out_ptr], #0x10]\n" // element 1
+ "zip2 v16.8h, v21.8h, v19.8h\n"
+ "add v1.8h, v1.8h, v17.8h\n"
+ "zip1 v17.8h, v18.8h, v16.8h\n"
+ "str q17, [%x[out_ptr], #0x20]\n" // element 2
+ "zip2 v16.8h, v18.8h, v16.8h\n"
+ "str q16, [%x[out_ptr], #0x30]\n" // element 3
+ "add v1.8h, v1.8h, v17.8h\n"
+ "zip2 v21.8h, v28.8h, v23.8h\n"
+ "zip1 v18.8h, v24.8h, v21.8h\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "zip2 v20.8h, v29.8h, v25.8h\n"
+ "zip2 v19.8h, v27.8h, v22.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x40]\n" // element 4
+ "add v1.8h, v1.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x50]\n" // element 5
+ "zip2 v18.8h, v24.8h, v21.8h\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x60]\n" // element 6
+ "add v1.8h, v1.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x70]\n" // element 7
+ "add %x[out_ptr], %x[out_ptr], #0x80\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "bge 3b\n" // repeat while at least 8 elements per row remain
+ "5:" // Main loop skip: width is now 0-7
+ "cbz %x[width], 10f\n" // nothing left over: only the sums remain to be written
+ "tbz %x[width], #2, 7f\n" // bit 2 clear: 1-3 leftover elements
+ "ldr d30, [x27], #0x8\n" // 4 halfwords per row; upper half of each register is zeroed by the load
+ "ldr d29, [x26], #0x8\n"
+ "ldr d28, [x25], #0x8\n"
+ "ldr d27, [x24], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d23, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
+ "tbz %x[width], #1, 6f\n"
+ "ld1 { v30.s }[2], [x27], #0x4\n" // two more halfwords per row into lanes 4-5
+ "ld1 { v29.s }[2], [x26], #0x4\n"
+ "ld1 { v28.s }[2], [x25], #0x4\n"
+ "ld1 { v27.s }[2], [x24], #0x4\n"
+ "ld1 { v24.s }[2], [x23], #0x4\n"
+ "ld1 { v25.s }[2], [x22], #0x4\n"
+ "ld1 { v23.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
+ "mov x19, #0x6\n" // from here x19 is reused as the number of leftover elements per row to emit
+ "tbz %x[width], #0, 9f\n"
+ "ld1 { v30.h }[6], [x27]\n" // seventh halfword per row
+ "ld1 { v29.h }[6], [x26]\n"
+ "ld1 { v28.h }[6], [x25]\n"
+ "ld1 { v27.h }[6], [x24]\n"
+ "ld1 { v24.h }[6], [x23]\n"
+ "ld1 { v25.h }[6], [x22]\n"
+ "ld1 { v23.h }[6], [x21]\n"
+ "ld1 { v22.h }[6], [x20]\n"
+ "mov x19, #0x7\n"
+ "b 9f\n"
+ "6:" // odd_loads_1_4: 4 elements loaded, check for a fifth
+ "mov x19, #0x4\n"
+ "tbz %x[width], #0, 9f\n"
+ "ld1 { v30.h }[4], [x27]\n"
+ "ld1 { v29.h }[4], [x26]\n"
+ "ld1 { v28.h }[4], [x25]\n"
+ "ld1 { v27.h }[4], [x24]\n"
+ "ld1 { v24.h }[4], [x23]\n"
+ "ld1 { v25.h }[4], [x22]\n"
+ "ld1 { v23.h }[4], [x21]\n"
+ "ld1 { v22.h }[4], [x20]\n"
+ "mov x19, #0x5\n"
+ "b 9f\n"
+ "7:" // odd_loads_2_0: 1-3 leftover elements
+ "tbz %x[width], #1, 8f\n"
+ "ldr s30, [x27], #0x4\n" // 2 halfwords per row
+ "ldr s29, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
+ "ldr s27, [x24], #0x4\n"
+ "ldr s24, [x23], #0x4\n"
+ "ldr s25, [x22], #0x4\n"
+ "ldr s23, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
+ "mov x19, #0x2\n"
+ "tbz %x[width], #0, 9f\n"
+ "ld1 { v30.h }[2], [x27]\n"
+ "ld1 { v29.h }[2], [x26]\n"
+ "ld1 { v28.h }[2], [x25]\n"
+ "ld1 { v27.h }[2], [x24]\n"
+ "ld1 { v24.h }[2], [x23]\n"
+ "ld1 { v25.h }[2], [x22]\n"
+ "ld1 { v23.h }[2], [x21]\n"
+ "ld1 { v22.h }[2], [x20]\n"
+ "mov x19, #0x3\n"
+ "b 9f\n"
+ "8:" // odd_loads_1_0: exactly one leftover element
+ "ldr h30, [x27, #0x0]\n"
+ "ldr h29, [x26, #0x0]\n"
+ "ldr h28, [x25, #0x0]\n"
+ "ldr h27, [x24, #0x0]\n"
+ "ldr h24, [x23, #0x0]\n"
+ "ldr h25, [x22, #0x0]\n"
+ "ldr h23, [x21, #0x0]\n"
+ "ldr h22, [x20, #0x0]\n"
+ "mov x19, #0x1\n"
+ "9:" // Odd load end: same transpose as the main loop, but only x19 vectors are stored
+ "zip1 v26.8h, v30.8h, v24.8h\n"
+ "subs x19, x19, #0x1\n" // count down after each stored vector; the following beq leaves once all leftovers are written
+ "zip1 v18.8h, v28.8h, v23.8h\n"
+ "zip1 v20.8h, v26.8h, v18.8h\n"
+ "zip1 v21.8h, v29.8h, v25.8h\n"
+ "zip1 v19.8h, v27.8h, v22.8h\n"
+ "zip1 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v16.8h, v20.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover element 0 of rows 0-7
+ "add v1.8h, v1.8h, v16.8h\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 10f\n"
+ "zip2 v17.8h, v20.8h, v17.8h\n"
+ "subs x19, x19, #0x1\n"
+ "add v1.8h, v1.8h, v17.8h\n"
+ "str q17, [%x[out_ptr], #0x0]\n" // leftover element 1
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 10f\n"
+ "zip2 v18.8h, v26.8h, v18.8h\n"
+ "zip2 v16.8h, v21.8h, v19.8h\n"
+ "subs x19, x19, #0x1\n"
+ "zip1 v17.8h, v18.8h, v16.8h\n"
+ "str q17, [%x[out_ptr], #0x0]\n" // leftover element 2
+ "add v1.8h, v1.8h, v17.8h\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 10f\n"
+ "zip2 v16.8h, v18.8h, v16.8h\n"
+ "subs x19, x19, #0x1\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover element 3
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 10f\n"
+ "zip2 v24.8h, v30.8h, v24.8h\n"
+ "zip2 v21.8h, v28.8h, v23.8h\n"
+ "subs x19, x19, #0x1\n"
+ "zip1 v18.8h, v24.8h, v21.8h\n"
+ "zip2 v20.8h, v29.8h, v25.8h\n"
+ "zip2 v19.8h, v27.8h, v22.8h\n"
+ "zip1 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover element 4
+ "add v1.8h, v1.8h, v16.8h\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 10f\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "subs x19, x19, #0x1\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover element 5
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 10f\n"
+ "zip2 v18.8h, v24.8h, v21.8h\n"
+ "zip2 v17.8h, v20.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover element 6 (at most 7 leftovers, so no further count check)
+ "add v1.8h, v1.8h, v16.8h\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "10:" // Odds skip: fold the remaining 16-bit sums and append the eight 32-bit row sums
+ "uaddw v0.4s, v0.4s, v1.4h\n"
+ "str q0, [%x[out_ptr], #0x0]\n" // sums for rows 0-3
+ "uaddw2 v31.4s, v31.4s, v1.8h\n"
+ "str q31, [%x[out_ptr], #0x10]\n" // sums for rows 4-7
+ "add %x[out_ptr], %x[out_ptr], #0x20\n" // leave out_ptr just past the sums
+ : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // read-write: both are modified by the assembly
+ : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) // read-only inputs
+ : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp
new file mode 100644
index 0000000000..86b90f1898
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16.hpp
@@ -0,0 +1,286 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<> // Specialisation <8, 1, VLType::None, false> for uint8_t input and uint16_t output: zero-extends bytes to 16 bits and transposes 8 rows into groups of 8 (one element per row).
+void interleave_block<8, 1, VLType::None, false>(
+ uint16_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height, // out_ptr: write cursor, updated to point past everything written; in: row pointers (entries 0-7 are always read); width: elements consumed per row; height: rows actually used
+ size_t row_offset, bool // row_offset: element (byte) offset added to each row pointer; the unnamed bool is not used by this specialisation
+)
+{
+ __asm__ __volatile__(
+ "ldr x27, [%x[in], #0x0]\n" // x27, x26, ..., x20 take in[0], in[1], ..., in[7]
+ "cmp %x[height], #0x8\n" // tested by the beq below; none of the instructions in between write the flags
+ "ldr x26, [%x[in], #0x8]\n"
+ "add x27, x27, %x[row_offset]\n" // no scaling: input elements are one byte each
+ "ldr x25, [%x[in], #0x10]\n"
+ "ldr x24, [%x[in], #0x18]\n"
+ "add x26, x26, %x[row_offset]\n"
+ "ldr x23, [%x[in], #0x20]\n"
+ "add x25, x25, %x[row_offset]\n"
+ "ldr x22, [%x[in], #0x28]\n"
+ "ldr x21, [%x[in], #0x30]\n"
+ "add x24, x24, %x[row_offset]\n"
+ "ldr x20, [%x[in], #0x38]\n"
+ "add x23, x23, %x[row_offset]\n"
+ "add x22, x22, %x[row_offset]\n"
+ "add x21, x21, %x[row_offset]\n"
+ "add x20, x20, %x[row_offset]\n"
+ "beq 1f\n" // height == 8: keep all eight pointers as loaded
+ "mov x20, x27\n" // height != 8: row 7 is read through row 0's pointer instead
+ "cmp %x[height], #0x2\n"
+ "csel x26, x26, x27, GE\n" // row 1 kept if height >= 2, otherwise aliases row 0
+ "csel x25, x25, x27, GT\n" // row 2 kept if height > 2
+ "cmp %x[height], #0x4\n"
+ "csel x24, x24, x27, GE\n" // row 3 kept if height >= 4
+ "csel x23, x23, x27, GT\n" // row 4 kept if height > 4
+ "cmp %x[height], #0x6\n"
+ "csel x22, x22, x27, GE\n" // row 5 kept if height >= 6
+ "csel x21, x21, x27, GT\n" // row 6 kept if height > 6
+ "1:" // no_pointer_adj
+ "prfm pldl1keep, [x27, #0x0]\n" // prefetch offsets 0x0 and 0x40 of every row before the loop
+ "cmp %x[width], #0x8\n" // tested by the blt after the prefetches (prfm does not write flags)
+ "prfm pldl1keep, [x26, #0x0]\n"
+ "prfm pldl1keep, [x25, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x0]\n"
+ "prfm pldl1keep, [x23, #0x0]\n"
+ "prfm pldl1keep, [x22, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x0]\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x40]\n"
+ "prfm pldl1keep, [x26, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x40]\n"
+ "prfm pldl1keep, [x22, #0x40]\n"
+ "prfm pldl1keep, [x21, #0x40]\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "blt 3f\n" // fewer than 8 elements per row: go straight to the tail
+ "2:" // Main loop head: each iteration consumes 8 bytes per row and writes 0x80 bytes
+ "ldr d30, [x27], #0x8\n" // rows 0-7 land in v30, v29, v28, v27, v23, v21, v26, v25 (8 bytes each, pointers post-incremented)
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "ldr d29, [x26], #0x8\n"
+ "ldr d28, [x25], #0x8\n"
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "ldr d27, [x24], #0x8\n"
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "prfm pldl1keep, [x24, #0x70]\n"
+ "ldr d26, [x21], #0x8\n"
+ "ldr d25, [x20], #0x8\n"
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "ushll v30.8h, v30.8b, #0x0\n" // shift-by-zero ushll zero-extends each byte to a 16-bit lane
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "zip1 v24.8h, v30.8h, v23.8h\n" // three levels of zip1/zip2 transpose the 8x8 tile: each stored vector holds element j of rows 0-7
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "zip2 v23.8h, v30.8h, v23.8h\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "zip1 v22.8h, v29.8h, v21.8h\n"
+ "subs %x[width], %x[width], #0x8\n"
+ "zip2 v21.8h, v29.8h, v21.8h\n"
+ "cmp %x[width], #0x8\n" // flags for the bge at the loop end; the zip/str/add instructions in between do not write flags
+ "zip1 v20.8h, v28.8h, v26.8h\n"
+ "zip1 v18.8h, v24.8h, v20.8h\n"
+ "zip1 v19.8h, v27.8h, v25.8h\n"
+ "zip1 v17.8h, v22.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // element 0 of rows 0-7
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x10]\n" // element 1
+ "zip2 v18.8h, v24.8h, v20.8h\n"
+ "zip2 v17.8h, v22.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x20]\n" // element 2
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x30]\n" // element 3
+ "zip2 v20.8h, v28.8h, v26.8h\n"
+ "zip1 v18.8h, v23.8h, v20.8h\n"
+ "zip2 v19.8h, v27.8h, v25.8h\n"
+ "zip1 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x40]\n" // element 4
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x50]\n" // element 5
+ "zip2 v18.8h, v23.8h, v20.8h\n"
+ "zip2 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x60]\n" // element 6
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x70]\n" // element 7
+ "add %x[out_ptr], %x[out_ptr], #0x80\n"
+ "bge 2b\n" // repeat while at least 8 elements per row remain
+ "3:" // Main loop skip: width is now 0-7
+ "cbz %x[width], 8f\n" // nothing left over
+ "tbz %x[width], #2, 5f\n" // bit 2 clear: 1-3 leftover elements
+ "ldr s30, [x27], #0x4\n" // 4 bytes per row; the rest of each register is zeroed by the load
+ "ldr s29, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
+ "ldr s27, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s26, [x21], #0x4\n"
+ "ldr s25, [x20], #0x4\n"
+ "tbz %x[width], #1, 4f\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n" // two more bytes per row into byte lanes 4-5
+ "ld1 { v29.h }[2], [x26], #0x2\n"
+ "ld1 { v28.h }[2], [x25], #0x2\n"
+ "ld1 { v27.h }[2], [x24], #0x2\n"
+ "ld1 { v23.h }[2], [x23], #0x2\n"
+ "ld1 { v21.h }[2], [x22], #0x2\n"
+ "ld1 { v26.h }[2], [x21], #0x2\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "mov x19, #0x6\n" // x19: number of leftover elements per row to emit
+ "tbz %x[width], #0, 7f\n"
+ "ld1 { v30.b }[6], [x27]\n" // seventh byte per row
+ "ld1 { v29.b }[6], [x26]\n"
+ "ld1 { v28.b }[6], [x25]\n"
+ "ld1 { v27.b }[6], [x24]\n"
+ "ld1 { v23.b }[6], [x23]\n"
+ "ld1 { v21.b }[6], [x22]\n"
+ "ld1 { v26.b }[6], [x21]\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "mov x19, #0x7\n"
+ "b 7f\n"
+ "4:" // odd_loads_1_4: 4 elements loaded, check for a fifth
+ "mov x19, #0x4\n"
+ "tbz %x[width], #0, 7f\n"
+ "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v29.b }[4], [x26]\n"
+ "ld1 { v28.b }[4], [x25]\n"
+ "ld1 { v27.b }[4], [x24]\n"
+ "ld1 { v23.b }[4], [x23]\n"
+ "ld1 { v21.b }[4], [x22]\n"
+ "ld1 { v26.b }[4], [x21]\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "mov x19, #0x5\n"
+ "b 7f\n"
+ "5:" // odd_loads_2_0: 1-3 leftover elements
+ "tbz %x[width], #1, 6f\n"
+ "ldr h30, [x27], #0x2\n" // 2 bytes per row
+ "ldr h29, [x26], #0x2\n"
+ "ldr h28, [x25], #0x2\n"
+ "ldr h27, [x24], #0x2\n"
+ "ldr h23, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h26, [x21], #0x2\n"
+ "ldr h25, [x20], #0x2\n"
+ "mov x19, #0x2\n"
+ "tbz %x[width], #0, 7f\n"
+ "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v29.b }[2], [x26]\n"
+ "ld1 { v28.b }[2], [x25]\n"
+ "ld1 { v27.b }[2], [x24]\n"
+ "ld1 { v23.b }[2], [x23]\n"
+ "ld1 { v21.b }[2], [x22]\n"
+ "ld1 { v26.b }[2], [x21]\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "mov x19, #0x3\n"
+ "b 7f\n"
+ "6:" // odd_loads_1_0: exactly one leftover element
+ "ldr b30, [x27, #0x0]\n"
+ "ldr b29, [x26, #0x0]\n"
+ "ldr b28, [x25, #0x0]\n"
+ "ldr b27, [x24, #0x0]\n"
+ "ldr b23, [x23, #0x0]\n"
+ "ldr b21, [x22, #0x0]\n"
+ "ldr b26, [x21, #0x0]\n"
+ "ldr b25, [x20, #0x0]\n"
+ "mov x19, #0x1\n"
+ "7:" // Odd load end: widen and transpose as in the main loop, but only x19 vectors are stored
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "zip1 v24.8h, v30.8h, v23.8h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "zip1 v20.8h, v28.8h, v26.8h\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "zip1 v22.8h, v29.8h, v21.8h\n"
+ "subs x19, x19, #0x1\n" // count down after each stored vector; the following beq leaves once all leftovers are written
+ "zip1 v18.8h, v24.8h, v20.8h\n"
+ "zip1 v19.8h, v27.8h, v25.8h\n"
+ "zip1 v17.8h, v22.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover element 0 of rows 0-7
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 8f\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "subs x19, x19, #0x1\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover element 1
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 8f\n"
+ "zip2 v18.8h, v24.8h, v20.8h\n"
+ "zip2 v17.8h, v22.8h, v19.8h\n"
+ "subs x19, x19, #0x1\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover element 2
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 8f\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "subs x19, x19, #0x1\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover element 3
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 8f\n"
+ "zip2 v23.8h, v30.8h, v23.8h\n"
+ "zip2 v20.8h, v28.8h, v26.8h\n"
+ "subs x19, x19, #0x1\n"
+ "zip1 v18.8h, v23.8h, v20.8h\n"
+ "zip2 v21.8h, v29.8h, v21.8h\n"
+ "zip2 v19.8h, v27.8h, v25.8h\n"
+ "zip1 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover element 4
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 8f\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "subs x19, x19, #0x1\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover element 5
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 8f\n"
+ "zip2 v18.8h, v23.8h, v20.8h\n"
+ "zip2 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover element 6 (at most 7 leftovers, so no further count check)
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "8:" // Odds skip
+
+ : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // read-write: both are modified by the assembly
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) // read-only inputs
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp
new file mode 100644
index 0000000000..cefb70c57b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block1_u8_u16_summing.hpp
@@ -0,0 +1,322 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<> // Specialisation <8, 1, VLType::None, true> for uint8_t input and uint16_t output: zero-extends bytes to 16 bits, transposes 8 rows into groups of 8 (one element per row) and appends per-row sums.
+void interleave_block<8, 1, VLType::None, true>(
+ uint16_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height, // out_ptr: write cursor, updated to point past everything written; in: row pointers (entries 0-7 are always read); width: elements consumed per row; height: rows actually used
+ size_t row_offset, bool first // row_offset: element (byte) offset added to each row pointer; first: false means resume from the sums stored in the 32 bytes just before out_ptr
+)
+{
+ __asm__ __volatile__(
+ "movi v1.8h, #0x0\n" // v1: 16-bit running sums, lane i accumulates row i
+ "ldr x27, [%x[in], #0x0]\n" // x27, x26, ..., x20 take in[0], in[1], ..., in[7]
+ "mov x19, #0x0\n" // x19: main-loop iterations since v1 was last folded into v0/v31
+ "movi v0.4s, #0x0\n" // v0: 32-bit sums for rows 0-3
+ "ldr x26, [%x[in], #0x8]\n"
+ "cmp %x[height], #0x8\n" // tested by the beq below; none of the instructions in between write the flags
+ "movi v31.4s, #0x0\n" // v31: 32-bit sums for rows 4-7
+ "ldr x25, [%x[in], #0x10]\n"
+ "add x27, x27, %x[row_offset]\n" // no scaling: input elements are one byte each
+ "ldr x24, [%x[in], #0x18]\n"
+ "ldr x23, [%x[in], #0x20]\n"
+ "add x26, x26, %x[row_offset]\n"
+ "ldr x22, [%x[in], #0x28]\n"
+ "add x25, x25, %x[row_offset]\n"
+ "ldr x21, [%x[in], #0x30]\n"
+ "add x24, x24, %x[row_offset]\n"
+ "ldr x20, [%x[in], #0x38]\n"
+ "add x23, x23, %x[row_offset]\n"
+ "add x22, x22, %x[row_offset]\n"
+ "add x21, x21, %x[row_offset]\n"
+ "add x20, x20, %x[row_offset]\n"
+ "beq 1f\n" // height == 8: keep all eight pointers as loaded
+ "mov x20, x27\n" // height != 8: row 7 is read through row 0's pointer instead
+ "cmp %x[height], #0x2\n"
+ "csel x26, x26, x27, GE\n" // row 1 kept if height >= 2, otherwise aliases row 0
+ "csel x25, x25, x27, GT\n" // row 2 kept if height > 2
+ "cmp %x[height], #0x4\n"
+ "csel x24, x24, x27, GE\n" // row 3 kept if height >= 4
+ "csel x23, x23, x27, GT\n" // row 4 kept if height > 4
+ "cmp %x[height], #0x6\n"
+ "csel x22, x22, x27, GE\n" // row 5 kept if height >= 6
+ "csel x21, x21, x27, GT\n" // row 6 kept if height > 6
+ "1:" // no_pointer_adj
+ "prfm pldl1keep, [x27, #0x0]\n" // prefetch offsets 0x0 and 0x40 of every row before the loop
+ "prfm pldl1keep, [x26, #0x0]\n"
+ "prfm pldl1keep, [x25, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x0]\n"
+ "prfm pldl1keep, [x23, #0x0]\n"
+ "prfm pldl1keep, [x22, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x0]\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x40]\n"
+ "prfm pldl1keep, [x26, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x40]\n"
+ "prfm pldl1keep, [x22, #0x40]\n"
+ "prfm pldl1keep, [x21, #0x40]\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "cbnz %w[first], 2f\n" // first pass: keep the zeroed sums
+ "sub %x[out_ptr], %x[out_ptr], #0x20\n" // otherwise step back over the 32 bytes of sums (presumably written by an earlier call); new data overwrites them and updated sums are re-appended at label 10
+ "ld1 { v0.4s }, [%x[out_ptr]]\n" // reload 32-bit sums for rows 0-3
+ "ldr q31, [%x[out_ptr], #0x10]\n" // reload 32-bit sums for rows 4-7
+ "2:" // first_pass
+ "cmp %x[width], #0x8\n"
+ "blt 5f\n" // fewer than 8 elements per row: go straight to the tail
+ "3:" // Main loop head: each iteration consumes 8 bytes per row and writes 0x80 bytes
+ "cmp x19, #0xe\n"
+ "ble 4f\n" // fold once 15 iterations have accumulated: at most 15*8 + 7 = 127 zero-extended bytes (<= 255 each) reach a lane between folds, so the 16-bit lanes cannot wrap
+ "uaddw v0.4s, v0.4s, v1.4h\n" // widen lanes 0-3 of v1 and add into v0
+ "uaddw2 v31.4s, v31.4s, v1.8h\n" // widen lanes 4-7 of v1 and add into v31
+ "mov x19, #0x0\n"
+ "movi v1.8h, #0x0\n"
+ "4:" // no_accumulate_16
+ "ldr d30, [x27], #0x8\n" // rows 0-7 land in v30, v29, v28, v27, v23, v21, v26, v25 (8 bytes each, pointers post-incremented)
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "ldr d29, [x26], #0x8\n"
+ "ldr d28, [x25], #0x8\n"
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "ldr d27, [x24], #0x8\n"
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "ldr d23, [x23], #0x8\n"
+ "ldr d21, [x22], #0x8\n"
+ "prfm pldl1keep, [x24, #0x70]\n"
+ "ldr d26, [x21], #0x8\n"
+ "ldr d25, [x20], #0x8\n"
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "ushll v30.8h, v30.8b, #0x0\n" // shift-by-zero ushll zero-extends each byte to a 16-bit lane
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "zip1 v24.8h, v30.8h, v23.8h\n" // three levels of zip1/zip2 transpose the 8x8 tile: each stored vector holds element j of rows 0-7
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "zip2 v23.8h, v30.8h, v23.8h\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "zip1 v22.8h, v29.8h, v21.8h\n"
+ "add x19, x19, #0x1\n"
+ "zip2 v21.8h, v29.8h, v21.8h\n"
+ "subs %x[width], %x[width], #0x8\n"
+ "zip1 v20.8h, v28.8h, v26.8h\n"
+ "cmp %x[width], #0x8\n" // flags for the bge at the loop end; the zip/str/add instructions in between do not write flags
+ "zip1 v18.8h, v24.8h, v20.8h\n"
+ "zip1 v19.8h, v27.8h, v25.8h\n"
+ "zip1 v17.8h, v22.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // element 0 of rows 0-7
+ "add v1.8h, v1.8h, v16.8h\n" // every stored vector is also added into the per-row 16-bit sums
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x10]\n" // element 1
+ "zip2 v18.8h, v24.8h, v20.8h\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "zip2 v17.8h, v22.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x20]\n" // element 2
+ "add v1.8h, v1.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x30]\n" // element 3
+ "zip2 v20.8h, v28.8h, v26.8h\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "zip1 v18.8h, v23.8h, v20.8h\n"
+ "zip2 v19.8h, v27.8h, v25.8h\n"
+ "zip1 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x40]\n" // element 4
+ "add v1.8h, v1.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x50]\n" // element 5
+ "zip2 v18.8h, v23.8h, v20.8h\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "zip2 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x60]\n" // element 6
+ "add v1.8h, v1.8h, v16.8h\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x70]\n" // element 7
+ "add %x[out_ptr], %x[out_ptr], #0x80\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "bge 3b\n" // repeat while at least 8 elements per row remain
+ "5:" // Main loop skip: width is now 0-7
+ "cbz %x[width], 10f\n" // nothing left over: only the sums remain to be written
+ "tbz %x[width], #2, 7f\n" // bit 2 clear: 1-3 leftover elements
+ "ldr s30, [x27], #0x4\n" // 4 bytes per row; the rest of each register is zeroed by the load
+ "ldr s29, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
+ "ldr s27, [x24], #0x4\n"
+ "ldr s23, [x23], #0x4\n"
+ "ldr s21, [x22], #0x4\n"
+ "ldr s26, [x21], #0x4\n"
+ "ldr s25, [x20], #0x4\n"
+ "tbz %x[width], #1, 6f\n"
+ "ld1 { v30.h }[2], [x27], #0x2\n" // two more bytes per row into byte lanes 4-5
+ "ld1 { v29.h }[2], [x26], #0x2\n"
+ "ld1 { v28.h }[2], [x25], #0x2\n"
+ "ld1 { v27.h }[2], [x24], #0x2\n"
+ "ld1 { v23.h }[2], [x23], #0x2\n"
+ "ld1 { v21.h }[2], [x22], #0x2\n"
+ "ld1 { v26.h }[2], [x21], #0x2\n"
+ "ld1 { v25.h }[2], [x20], #0x2\n"
+ "mov x19, #0x6\n" // from here x19 is reused as the number of leftover elements per row to emit
+ "tbz %x[width], #0, 9f\n"
+ "ld1 { v30.b }[6], [x27]\n" // seventh byte per row
+ "ld1 { v29.b }[6], [x26]\n"
+ "ld1 { v28.b }[6], [x25]\n"
+ "ld1 { v27.b }[6], [x24]\n"
+ "ld1 { v23.b }[6], [x23]\n"
+ "ld1 { v21.b }[6], [x22]\n"
+ "ld1 { v26.b }[6], [x21]\n"
+ "ld1 { v25.b }[6], [x20]\n"
+ "mov x19, #0x7\n"
+ "b 9f\n"
+ "6:" // odd_loads_1_4: 4 elements loaded, check for a fifth
+ "mov x19, #0x4\n"
+ "tbz %x[width], #0, 9f\n"
+ "ld1 { v30.b }[4], [x27]\n"
+ "ld1 { v29.b }[4], [x26]\n"
+ "ld1 { v28.b }[4], [x25]\n"
+ "ld1 { v27.b }[4], [x24]\n"
+ "ld1 { v23.b }[4], [x23]\n"
+ "ld1 { v21.b }[4], [x22]\n"
+ "ld1 { v26.b }[4], [x21]\n"
+ "ld1 { v25.b }[4], [x20]\n"
+ "mov x19, #0x5\n"
+ "b 9f\n"
+ "7:" // odd_loads_2_0: 1-3 leftover elements
+ "tbz %x[width], #1, 8f\n"
+ "ldr h30, [x27], #0x2\n" // 2 bytes per row
+ "ldr h29, [x26], #0x2\n"
+ "ldr h28, [x25], #0x2\n"
+ "ldr h27, [x24], #0x2\n"
+ "ldr h23, [x23], #0x2\n"
+ "ldr h21, [x22], #0x2\n"
+ "ldr h26, [x21], #0x2\n"
+ "ldr h25, [x20], #0x2\n"
+ "mov x19, #0x2\n"
+ "tbz %x[width], #0, 9f\n"
+ "ld1 { v30.b }[2], [x27]\n"
+ "ld1 { v29.b }[2], [x26]\n"
+ "ld1 { v28.b }[2], [x25]\n"
+ "ld1 { v27.b }[2], [x24]\n"
+ "ld1 { v23.b }[2], [x23]\n"
+ "ld1 { v21.b }[2], [x22]\n"
+ "ld1 { v26.b }[2], [x21]\n"
+ "ld1 { v25.b }[2], [x20]\n"
+ "mov x19, #0x3\n"
+ "b 9f\n"
+ "8:" // odd_loads_1_0: exactly one leftover element
+ "ldr b30, [x27, #0x0]\n"
+ "ldr b29, [x26, #0x0]\n"
+ "ldr b28, [x25, #0x0]\n"
+ "ldr b27, [x24, #0x0]\n"
+ "ldr b23, [x23, #0x0]\n"
+ "ldr b21, [x22, #0x0]\n"
+ "ldr b26, [x21, #0x0]\n"
+ "ldr b25, [x20, #0x0]\n"
+ "mov x19, #0x1\n"
+ "9:" // Odd load end: widen and transpose as in the main loop, but only x19 vectors are stored and summed
+ "ushll v30.8h, v30.8b, #0x0\n"
+ "ushll v29.8h, v29.8b, #0x0\n"
+ "ushll v28.8h, v28.8b, #0x0\n"
+ "ushll v27.8h, v27.8b, #0x0\n"
+ "ushll v23.8h, v23.8b, #0x0\n"
+ "zip1 v24.8h, v30.8h, v23.8h\n"
+ "ushll v21.8h, v21.8b, #0x0\n"
+ "ushll v26.8h, v26.8b, #0x0\n"
+ "zip1 v20.8h, v28.8h, v26.8h\n"
+ "ushll v25.8h, v25.8b, #0x0\n"
+ "zip1 v22.8h, v29.8h, v21.8h\n"
+ "subs x19, x19, #0x1\n" // count down after each stored vector; the following beq leaves once all leftovers are written
+ "zip1 v18.8h, v24.8h, v20.8h\n"
+ "zip1 v19.8h, v27.8h, v25.8h\n"
+ "zip1 v17.8h, v22.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover element 0 of rows 0-7
+ "add v1.8h, v1.8h, v16.8h\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 10f\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "subs x19, x19, #0x1\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover element 1
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 10f\n"
+ "zip2 v18.8h, v24.8h, v20.8h\n"
+ "zip2 v17.8h, v22.8h, v19.8h\n"
+ "subs x19, x19, #0x1\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover element 2
+ "add v1.8h, v1.8h, v16.8h\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 10f\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "subs x19, x19, #0x1\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover element 3
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 10f\n"
+ "zip2 v23.8h, v30.8h, v23.8h\n"
+ "zip2 v20.8h, v28.8h, v26.8h\n"
+ "subs x19, x19, #0x1\n"
+ "zip1 v18.8h, v23.8h, v20.8h\n"
+ "zip2 v21.8h, v29.8h, v21.8h\n"
+ "zip2 v19.8h, v27.8h, v25.8h\n"
+ "zip1 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover element 4
+ "add v1.8h, v1.8h, v16.8h\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 10f\n"
+ "zip2 v16.8h, v18.8h, v17.8h\n"
+ "subs x19, x19, #0x1\n"
+ "add v1.8h, v1.8h, v16.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover element 5
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "beq 10f\n"
+ "zip2 v18.8h, v23.8h, v20.8h\n"
+ "zip2 v17.8h, v21.8h, v19.8h\n"
+ "zip1 v16.8h, v18.8h, v17.8h\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // leftover element 6 (at most 7 leftovers, so no further count check)
+ "add v1.8h, v1.8h, v16.8h\n"
+ "add %x[out_ptr], %x[out_ptr], #0x10\n"
+ "10:" // Odds skip: fold the remaining 16-bit sums and append the eight 32-bit row sums
+ "uaddw v0.4s, v0.4s, v1.4h\n"
+ "str q0, [%x[out_ptr], #0x0]\n" // sums for rows 0-3
+ "uaddw2 v31.4s, v31.4s, v1.8h\n"
+ "str q31, [%x[out_ptr], #0x10]\n" // sums for rows 4-7
+ "add %x[out_ptr], %x[out_ptr], #0x20\n" // leave out_ptr just past the sums
+ : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // read-write: both are modified by the assembly
+ : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset) // read-only inputs
+ : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp
new file mode 100644
index 0000000000..5377edc1e1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_bf16_bf16.hpp
@@ -0,0 +1,247 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<>
+void interleave_block<8, 2, VLType::None, false>( // bfloat16: interleaves 8 rows in blocks of 2 elements (one 32-bit lane per block)
+ bfloat16 * &out_ptr, const bfloat16 * const * in, size_t width, size_t height, // out_ptr is advanced past the data written; in[] supplies the row base pointers
+ size_t row_offset, bool // row_offset is counted in elements; the trailing bool is unnamed and unused here
+)
+{
+ __asm__ __volatile__(
+ "ldr x27, [%x[in], #0x0]\n" // x27..x20 receive in[0]..in[7]. NOTE(review): all eight entries are read even when height < 8 - confirm callers always provide 8
+ "cmp %x[height], #0x8\n" // flags are consumed by "beq 1f" below (the ldr/add in between do not set flags)
+ "ldr x26, [%x[in], #0x8]\n"
+ "add x27, x27, %x[row_offset], LSL #1\n" // LSL #1: row_offset elements of 2 bytes each
+ "ldr x25, [%x[in], #0x10]\n"
+ "ldr x24, [%x[in], #0x18]\n"
+ "add x26, x26, %x[row_offset], LSL #1\n"
+ "ldr x23, [%x[in], #0x20]\n"
+ "add x25, x25, %x[row_offset], LSL #1\n"
+ "ldr x22, [%x[in], #0x28]\n"
+ "ldr x21, [%x[in], #0x30]\n"
+ "add x24, x24, %x[row_offset], LSL #1\n"
+ "ldr x20, [%x[in], #0x38]\n"
+ "add x23, x23, %x[row_offset], LSL #1\n"
+ "add x22, x22, %x[row_offset], LSL #1\n"
+ "add x21, x21, %x[row_offset], LSL #1\n"
+ "add x20, x20, %x[row_offset], LSL #1\n"
+ "beq 1f\n" // height == 8: keep all eight pointers as loaded
+ "mov x20, x27\n" // otherwise row 7 is never valid: point it at row 0
+ "cmp %x[height], #0x2\n" // each cmp/csel pair below redirects rows with index >= height to row 0's pointer
+ "csel x26, x26, x27, GE\n"
+ "csel x25, x25, x27, GT\n"
+ "cmp %x[height], #0x4\n"
+ "csel x24, x24, x27, GE\n"
+ "csel x23, x23, x27, GT\n"
+ "cmp %x[height], #0x6\n"
+ "csel x22, x22, x27, GE\n"
+ "csel x21, x21, x27, GT\n" // redirected rows are still interleaved, so their output slots repeat row 0's data
+ "1:" // no_pointer_adj
+ "prfm pldl1keep, [x27, #0x0]\n"
+ "cmp %x[width], #0x8\n" // flags are consumed by "blt 3f" after the prefetches
+ "prfm pldl1keep, [x26, #0x0]\n"
+ "prfm pldl1keep, [x25, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x0]\n"
+ "prfm pldl1keep, [x23, #0x0]\n"
+ "prfm pldl1keep, [x22, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x0]\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x40]\n"
+ "prfm pldl1keep, [x26, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x40]\n"
+ "prfm pldl1keep, [x22, #0x40]\n"
+ "prfm pldl1keep, [x21, #0x40]\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "blt 3f\n" // fewer than 8 elements per row: go straight to the remainder handling
+ "2:" // Main loop head
+ "ldr q28, [x27], #0x10\n" // one iteration consumes 16 bytes (8 bf16) from each of the 8 rows
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "ldr q27, [x26], #0x10\n"
+ "ldr q26, [x25], #0x10\n"
+ "zip1 v23.4s, v28.4s, v26.4s\n" // two rounds of 32-bit zips gather lane k of rows 0..3 (and of rows 4..7) into one vector
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "ldr q22, [x24], #0x10\n"
+ "zip2 v26.4s, v28.4s, v26.4s\n"
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "ldr q25, [x23], #0x10\n"
+ "zip1 v20.4s, v27.4s, v22.4s\n"
+ "prfm pldl1keep, [x24, #0x70]\n"
+ "ldr q24, [x22], #0x10\n"
+ "zip1 v16.4s, v23.4s, v20.4s\n"
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "ldr q19, [x21], #0x10\n"
+ "zip2 v23.4s, v23.4s, v20.4s\n"
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "zip2 v22.4s, v27.4s, v22.4s\n"
+ "ldr q21, [x20], #0x10\n"
+ "zip1 v18.4s, v25.4s, v19.4s\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // block 0, rows 0..3
+ "zip1 v20.4s, v26.4s, v22.4s\n"
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "zip1 v16.4s, v24.4s, v21.4s\n"
+ "subs %x[width], %x[width], #0x8\n"
+ "zip1 v17.4s, v18.4s, v16.4s\n"
+ "cmp %x[width], #0x8\n" // this cmp (not the subs) provides the flags for "bge 2b"
+ "zip2 v16.4s, v18.4s, v16.4s\n"
+ "str q17, [%x[out_ptr], #0x10]\n" // block 0, rows 4..7
+ "zip2 v19.4s, v25.4s, v19.4s\n"
+ "str q23, [%x[out_ptr], #0x20]\n" // block 1, rows 0..3
+ "zip2 v18.4s, v24.4s, v21.4s\n"
+ "str q16, [%x[out_ptr], #0x30]\n" // block 1, rows 4..7
+ "zip1 v16.4s, v19.4s, v18.4s\n"
+ "str q20, [%x[out_ptr], #0x40]\n" // block 2, rows 0..3
+ "zip2 v17.4s, v26.4s, v22.4s\n"
+ "str q16, [%x[out_ptr], #0x50]\n" // block 2, rows 4..7
+ "zip2 v16.4s, v19.4s, v18.4s\n"
+ "str q17, [%x[out_ptr], #0x60]\n" // block 3, rows 0..3
+ "str q16, [%x[out_ptr], #0x70]\n" // block 3, rows 4..7
+ "add %x[out_ptr], %x[out_ptr], #0x80\n" // 8 stores of 16 bytes per iteration
+ "bge 2b\n"
+ "3:" // Main loop skip
+ "cbz %x[width], 8f\n" // no remainder: done
+ "tbz %x[width], #2, 5f\n" // remainder (1..7) is decoded bit by bit; bit 2 clear means fewer than 4 left
+ "ldr d28, [x27], #0x8\n" // 4 elements per row; scalar SIMD loads zero the rest of the register (A64 behaviour)
+ "ldr d27, [x26], #0x8\n"
+ "ldr d26, [x25], #0x8\n"
+ "ldr d22, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d19, [x21], #0x8\n"
+ "ldr d21, [x20], #0x8\n"
+ "tbz %x[width], #1, 4f\n"
+ "ld1 { v28.s }[2], [x27], #0x4\n" // elements 4..5 go into 32-bit lane 2
+ "ld1 { v27.s }[2], [x26], #0x4\n"
+ "ld1 { v26.s }[2], [x25], #0x4\n"
+ "ld1 { v22.s }[2], [x24], #0x4\n"
+ "ld1 { v25.s }[2], [x23], #0x4\n"
+ "ld1 { v24.s }[2], [x22], #0x4\n"
+ "ld1 { v19.s }[2], [x21], #0x4\n"
+ "ld1 { v21.s }[2], [x20], #0x4\n"
+ "mov x19, #0x3\n" // x19 = number of 2-element blocks to emit (6 elements -> 3 blocks)
+ "tbz %x[width], #0, 7f\n"
+ "ld1 { v28.h }[6], [x27]\n" // element 6 goes into 16-bit lane 6
+ "ld1 { v27.h }[6], [x26]\n"
+ "ld1 { v26.h }[6], [x25]\n"
+ "ld1 { v22.h }[6], [x24]\n"
+ "ld1 { v25.h }[6], [x23]\n"
+ "ld1 { v24.h }[6], [x22]\n"
+ "ld1 { v19.h }[6], [x21]\n"
+ "ld1 { v21.h }[6], [x20]\n"
+ "mov x19, #0x4\n" // 7 elements -> 4 blocks, the last one half filled
+ "b 7f\n"
+ "4:" // odd_loads_1_4
+ "mov x19, #0x2\n" // 4 elements -> 2 blocks
+ "tbz %x[width], #0, 7f\n"
+ "ld1 { v28.h }[4], [x27]\n"
+ "ld1 { v27.h }[4], [x26]\n"
+ "ld1 { v26.h }[4], [x25]\n"
+ "ld1 { v22.h }[4], [x24]\n"
+ "ld1 { v25.h }[4], [x23]\n"
+ "ld1 { v24.h }[4], [x22]\n"
+ "ld1 { v19.h }[4], [x21]\n"
+ "ld1 { v21.h }[4], [x20]\n"
+ "mov x19, #0x3\n" // 5 elements -> 3 blocks
+ "b 7f\n"
+ "5:" // odd_loads_2_0
+ "tbz %x[width], #1, 6f\n"
+ "ldr s28, [x27], #0x4\n" // 2 elements per row
+ "ldr s27, [x26], #0x4\n"
+ "ldr s26, [x25], #0x4\n"
+ "ldr s22, [x24], #0x4\n"
+ "ldr s25, [x23], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
+ "ldr s19, [x21], #0x4\n"
+ "ldr s21, [x20], #0x4\n"
+ "mov x19, #0x1\n" // 2 elements -> 1 block
+ "tbz %x[width], #0, 7f\n"
+ "ld1 { v28.h }[2], [x27]\n"
+ "ld1 { v27.h }[2], [x26]\n"
+ "ld1 { v26.h }[2], [x25]\n"
+ "ld1 { v22.h }[2], [x24]\n"
+ "ld1 { v25.h }[2], [x23]\n"
+ "ld1 { v24.h }[2], [x22]\n"
+ "ld1 { v19.h }[2], [x21]\n"
+ "ld1 { v21.h }[2], [x20]\n"
+ "mov x19, #0x2\n" // 3 elements -> 2 blocks
+ "b 7f\n"
+ "6:" // odd_loads_1_0
+ "ldr h28, [x27, #0x0]\n" // a single element per row
+ "ldr h27, [x26, #0x0]\n"
+ "ldr h26, [x25, #0x0]\n"
+ "ldr h22, [x24, #0x0]\n"
+ "ldr h25, [x23, #0x0]\n"
+ "ldr h24, [x22, #0x0]\n"
+ "ldr h19, [x21, #0x0]\n"
+ "ldr h21, [x20, #0x0]\n"
+ "mov x19, #0x1\n"
+ "7:" // Odd load end
+ "zip1 v23.4s, v28.4s, v26.4s\n" // same zip network as the main loop, but emitting only x19 blocks of 0x20 bytes
+ "subs x19, x19, #0x1\n" // flags are consumed by the next "beq 8f"
+ "zip1 v20.4s, v27.4s, v22.4s\n"
+ "zip1 v16.4s, v23.4s, v20.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v18.4s, v25.4s, v19.4s\n"
+ "zip1 v16.4s, v24.4s, v21.4s\n"
+ "zip1 v17.4s, v18.4s, v16.4s\n"
+ "str q17, [%x[out_ptr], #0x10]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "beq 8f\n"
+ "zip2 v23.4s, v23.4s, v20.4s\n"
+ "zip2 v16.4s, v18.4s, v16.4s\n"
+ "str q23, [%x[out_ptr], #0x0]\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "subs x19, x19, #0x1\n"
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "beq 8f\n"
+ "zip2 v26.4s, v28.4s, v26.4s\n"
+ "zip2 v22.4s, v27.4s, v22.4s\n"
+ "subs x19, x19, #0x1\n"
+ "zip1 v20.4s, v26.4s, v22.4s\n"
+ "str q20, [%x[out_ptr], #0x0]\n"
+ "zip2 v19.4s, v25.4s, v19.4s\n"
+ "zip2 v18.4s, v24.4s, v21.4s\n"
+ "zip1 v16.4s, v19.4s, v18.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "beq 8f\n"
+ "zip2 v17.4s, v26.4s, v22.4s\n"
+ "zip2 v16.4s, v19.4s, v18.4s\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "8:" // Odds skip
+
+ : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // both are modified in place: the write cursor and the local copy of width
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp
new file mode 100644
index 0000000000..3aea6a8999
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block2_fp32_fp32.hpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<>
+void interleave_block<8, 2, VLType::None, false>( // float: interleaves 8 rows in blocks of 2 elements (one 64-bit lane per block)
+ float * &out_ptr, const float * const * in, size_t width, size_t height, // out_ptr is advanced past the data written; in[] supplies the row base pointers
+ size_t row_offset, bool // row_offset is counted in elements; the trailing bool is unnamed and unused here
+)
+{
+ __asm__ __volatile__(
+ "ldr x27, [%x[in], #0x0]\n" // x27..x20 receive in[0]..in[7]. NOTE(review): all eight entries are read even when height < 8 - confirm callers always provide 8
+ "cmp %x[height], #0x8\n" // flags are consumed by "beq 1f" below (the ldr/add in between do not set flags)
+ "ldr x26, [%x[in], #0x8]\n"
+ "add x27, x27, %x[row_offset], LSL #2\n" // LSL #2: row_offset elements of 4 bytes each
+ "ldr x25, [%x[in], #0x10]\n"
+ "ldr x24, [%x[in], #0x18]\n"
+ "add x26, x26, %x[row_offset], LSL #2\n"
+ "ldr x23, [%x[in], #0x20]\n"
+ "add x25, x25, %x[row_offset], LSL #2\n"
+ "ldr x22, [%x[in], #0x28]\n"
+ "ldr x21, [%x[in], #0x30]\n"
+ "add x24, x24, %x[row_offset], LSL #2\n"
+ "ldr x20, [%x[in], #0x38]\n"
+ "add x23, x23, %x[row_offset], LSL #2\n"
+ "add x22, x22, %x[row_offset], LSL #2\n"
+ "add x21, x21, %x[row_offset], LSL #2\n"
+ "add x20, x20, %x[row_offset], LSL #2\n"
+ "beq 1f\n" // height == 8: keep all eight pointers as loaded
+ "mov x20, x27\n" // otherwise row 7 is never valid: point it at row 0
+ "cmp %x[height], #0x2\n" // each cmp/csel pair below redirects rows with index >= height to row 0's pointer
+ "csel x26, x26, x27, GE\n"
+ "csel x25, x25, x27, GT\n"
+ "cmp %x[height], #0x4\n"
+ "csel x24, x24, x27, GE\n"
+ "csel x23, x23, x27, GT\n"
+ "cmp %x[height], #0x6\n"
+ "csel x22, x22, x27, GE\n"
+ "csel x21, x21, x27, GT\n" // redirected rows are still interleaved, so their output slots repeat row 0's data
+ "1:" // no_pointer_adj
+ "prfm pldl1keep, [x27, #0x0]\n"
+ "cmp %x[width], #0x4\n" // flags are consumed by "blt 3f" after the prefetches
+ "prfm pldl1keep, [x26, #0x0]\n"
+ "prfm pldl1keep, [x25, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x0]\n"
+ "prfm pldl1keep, [x23, #0x0]\n"
+ "prfm pldl1keep, [x22, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x0]\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x40]\n"
+ "prfm pldl1keep, [x26, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x40]\n"
+ "prfm pldl1keep, [x22, #0x40]\n"
+ "prfm pldl1keep, [x21, #0x40]\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "blt 3f\n" // fewer than 4 elements per row: go straight to the remainder handling
+ "2:" // Main loop head
+ "ldr q27, [x27], #0x10\n" // one iteration consumes 16 bytes (4 floats) from each of the 8 rows
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "ldr q24, [x26], #0x10\n"
+ "zip1 v26.2d, v27.2d, v24.2d\n" // 64-bit zips pair block k of two adjacent rows: zip1 gives block 0, zip2 gives block 1
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "ldr q25, [x25], #0x10\n"
+ "zip2 v24.2d, v27.2d, v24.2d\n"
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "ldr q21, [x24], #0x10\n"
+ "zip1 v23.2d, v25.2d, v21.2d\n"
+ "prfm pldl1keep, [x24, #0x70]\n"
+ "ldr q22, [x23], #0x10\n"
+ "zip2 v21.2d, v25.2d, v21.2d\n"
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "ldr q18, [x22], #0x10\n"
+ "zip1 v20.2d, v22.2d, v18.2d\n"
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "ldr q19, [x21], #0x10\n"
+ "zip2 v18.2d, v22.2d, v18.2d\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v17.2d, v19.2d, v16.2d\n"
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "str q26, [%x[out_ptr], #0x0]\n" // block 0, rows 0..1
+ "zip2 v16.2d, v19.2d, v16.2d\n"
+ "str q23, [%x[out_ptr], #0x10]\n" // block 0, rows 2..3
+ "str q20, [%x[out_ptr], #0x20]\n" // block 0, rows 4..5
+ "str q17, [%x[out_ptr], #0x30]\n" // block 0, rows 6..7
+ "str q24, [%x[out_ptr], #0x40]\n" // block 1, rows 0..1
+ "str q21, [%x[out_ptr], #0x50]\n" // block 1, rows 2..3
+ "str q18, [%x[out_ptr], #0x60]\n" // block 1, rows 4..5
+ "str q16, [%x[out_ptr], #0x70]\n" // block 1, rows 6..7
+ "subs %x[width], %x[width], #0x4\n"
+ "cmp %x[width], #0x4\n" // this cmp (not the subs) provides the flags for "bge 2b"
+ "add %x[out_ptr], %x[out_ptr], #0x80\n" // 8 stores of 16 bytes per iteration
+ "bge 2b\n"
+ "3:" // Main loop skip
+ "cbz %x[width], 6f\n" // no remainder: done
+ "tbz %x[width], #1, 4f\n" // remainder (1..3) is decoded bit by bit; bit 1 clear means a single element is left
+ "ldr d27, [x27], #0x8\n" // 2 elements per row; scalar SIMD loads zero the rest of the register (A64 behaviour)
+ "ldr d24, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d18, [x22], #0x8\n"
+ "ldr d19, [x21], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "mov x19, #0x1\n" // x19 = number of 2-element blocks to emit (2 elements -> 1 block)
+ "tbz %x[width], #0, 5f\n"
+ "ld1 { v27.s }[2], [x27]\n" // element 2 goes into 32-bit lane 2
+ "ld1 { v24.s }[2], [x26]\n"
+ "ld1 { v25.s }[2], [x25]\n"
+ "ld1 { v21.s }[2], [x24]\n"
+ "ld1 { v22.s }[2], [x23]\n"
+ "ld1 { v18.s }[2], [x22]\n"
+ "ld1 { v19.s }[2], [x21]\n"
+ "ld1 { v16.s }[2], [x20]\n"
+ "mov x19, #0x2\n" // 3 elements -> 2 blocks, the second one half filled
+ "b 5f\n"
+ "4:" // odd_loads_1_0
+ "ldr s27, [x27, #0x0]\n" // a single element per row
+ "ldr s24, [x26, #0x0]\n"
+ "ldr s25, [x25, #0x0]\n"
+ "ldr s21, [x24, #0x0]\n"
+ "ldr s22, [x23, #0x0]\n"
+ "ldr s18, [x22, #0x0]\n"
+ "ldr s19, [x21, #0x0]\n"
+ "ldr s16, [x20, #0x0]\n"
+ "mov x19, #0x1\n"
+ "5:" // Odd load end
+ "zip1 v26.2d, v27.2d, v24.2d\n" // same zips as the main loop, but emitting only x19 blocks of 0x40 bytes
+ "subs x19, x19, #0x1\n" // flags are consumed by "beq 6f" below
+ "zip1 v23.2d, v25.2d, v21.2d\n"
+ "str q26, [%x[out_ptr], #0x0]\n"
+ "zip1 v20.2d, v22.2d, v18.2d\n"
+ "str q23, [%x[out_ptr], #0x10]\n"
+ "zip1 v17.2d, v19.2d, v16.2d\n"
+ "str q20, [%x[out_ptr], #0x20]\n"
+ "str q17, [%x[out_ptr], #0x30]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x40\n"
+ "beq 6f\n"
+ "zip2 v24.2d, v27.2d, v24.2d\n"
+ "zip2 v21.2d, v25.2d, v21.2d\n"
+ "str q24, [%x[out_ptr], #0x0]\n"
+ "zip2 v18.2d, v22.2d, v18.2d\n"
+ "str q21, [%x[out_ptr], #0x10]\n"
+ "zip2 v16.2d, v19.2d, v16.2d\n"
+ "str q18, [%x[out_ptr], #0x20]\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x40\n"
+ "6:" // Odds skip
+
+ : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // both are modified in place: the write cursor and the local copy of width
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp
new file mode 100644
index 0000000000..4780b77a4a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_bf16_bf16.hpp
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<>
+void interleave_block<8, 4, VLType::None, false>( // bfloat16: interleaves 8 rows in blocks of 4 elements (one 64-bit lane per block)
+ bfloat16 * &out_ptr, const bfloat16 * const * in, size_t width, size_t height, // out_ptr is advanced past the data written; in[] supplies the row base pointers
+ size_t row_offset, bool // row_offset is counted in elements; the trailing bool is unnamed and unused here
+)
+{
+ __asm__ __volatile__(
+ "ldr x27, [%x[in], #0x0]\n" // x27..x20 receive in[0]..in[7]. NOTE(review): all eight entries are read even when height < 8 - confirm callers always provide 8
+ "cmp %x[height], #0x8\n" // flags are consumed by "beq 1f" below (the ldr/add in between do not set flags)
+ "ldr x26, [%x[in], #0x8]\n"
+ "add x27, x27, %x[row_offset], LSL #1\n" // LSL #1: row_offset elements of 2 bytes each
+ "ldr x25, [%x[in], #0x10]\n"
+ "ldr x24, [%x[in], #0x18]\n"
+ "add x26, x26, %x[row_offset], LSL #1\n"
+ "ldr x23, [%x[in], #0x20]\n"
+ "add x25, x25, %x[row_offset], LSL #1\n"
+ "ldr x22, [%x[in], #0x28]\n"
+ "ldr x21, [%x[in], #0x30]\n"
+ "add x24, x24, %x[row_offset], LSL #1\n"
+ "ldr x20, [%x[in], #0x38]\n"
+ "add x23, x23, %x[row_offset], LSL #1\n"
+ "add x22, x22, %x[row_offset], LSL #1\n"
+ "add x21, x21, %x[row_offset], LSL #1\n"
+ "add x20, x20, %x[row_offset], LSL #1\n"
+ "beq 1f\n" // height == 8: keep all eight pointers as loaded
+ "mov x20, x27\n" // otherwise row 7 is never valid: point it at row 0
+ "cmp %x[height], #0x2\n" // each cmp/csel pair below redirects rows with index >= height to row 0's pointer
+ "csel x26, x26, x27, GE\n"
+ "csel x25, x25, x27, GT\n"
+ "cmp %x[height], #0x4\n"
+ "csel x24, x24, x27, GE\n"
+ "csel x23, x23, x27, GT\n"
+ "cmp %x[height], #0x6\n"
+ "csel x22, x22, x27, GE\n"
+ "csel x21, x21, x27, GT\n" // redirected rows are still interleaved, so their output slots repeat row 0's data
+ "1:" // no_pointer_adj
+ "prfm pldl1keep, [x27, #0x0]\n"
+ "cmp %x[width], #0x8\n" // flags are consumed by "blt 3f" after the prefetches
+ "prfm pldl1keep, [x26, #0x0]\n"
+ "prfm pldl1keep, [x25, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x0]\n"
+ "prfm pldl1keep, [x23, #0x0]\n"
+ "prfm pldl1keep, [x22, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x0]\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x40]\n"
+ "prfm pldl1keep, [x26, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x40]\n"
+ "prfm pldl1keep, [x22, #0x40]\n"
+ "prfm pldl1keep, [x21, #0x40]\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "blt 3f\n" // fewer than 8 elements per row: go straight to the remainder handling
+ "2:" // Main loop head
+ "ldr q27, [x27], #0x10\n" // one iteration consumes 16 bytes (8 bf16) from each of the 8 rows
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "ldr q24, [x26], #0x10\n"
+ "zip1 v26.2d, v27.2d, v24.2d\n" // 64-bit zips pair block k of two adjacent rows: zip1 gives block 0, zip2 gives block 1
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "ldr q25, [x25], #0x10\n"
+ "zip2 v24.2d, v27.2d, v24.2d\n"
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "ldr q21, [x24], #0x10\n"
+ "zip1 v23.2d, v25.2d, v21.2d\n"
+ "prfm pldl1keep, [x24, #0x70]\n"
+ "ldr q22, [x23], #0x10\n"
+ "zip2 v21.2d, v25.2d, v21.2d\n"
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "ldr q18, [x22], #0x10\n"
+ "zip1 v20.2d, v22.2d, v18.2d\n"
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "ldr q19, [x21], #0x10\n"
+ "zip2 v18.2d, v22.2d, v18.2d\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v17.2d, v19.2d, v16.2d\n"
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "str q26, [%x[out_ptr], #0x0]\n" // block 0, rows 0..1
+ "zip2 v16.2d, v19.2d, v16.2d\n"
+ "str q23, [%x[out_ptr], #0x10]\n" // block 0, rows 2..3
+ "str q20, [%x[out_ptr], #0x20]\n" // block 0, rows 4..5
+ "str q17, [%x[out_ptr], #0x30]\n" // block 0, rows 6..7
+ "str q24, [%x[out_ptr], #0x40]\n" // block 1, rows 0..1
+ "str q21, [%x[out_ptr], #0x50]\n" // block 1, rows 2..3
+ "str q18, [%x[out_ptr], #0x60]\n" // block 1, rows 4..5
+ "str q16, [%x[out_ptr], #0x70]\n" // block 1, rows 6..7
+ "subs %x[width], %x[width], #0x8\n"
+ "cmp %x[width], #0x8\n" // this cmp (not the subs) provides the flags for "bge 2b"
+ "add %x[out_ptr], %x[out_ptr], #0x80\n" // 8 stores of 16 bytes per iteration
+ "bge 2b\n"
+ "3:" // Main loop skip
+ "cbz %x[width], 8f\n" // no remainder: done
+ "tbz %x[width], #2, 5f\n" // remainder (1..7) is decoded bit by bit; bit 2 clear means fewer than 4 left
+ "ldr d27, [x27], #0x8\n" // 4 elements per row; scalar SIMD loads zero the rest of the register (A64 behaviour)
+ "ldr d24, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d18, [x22], #0x8\n"
+ "ldr d19, [x21], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "tbz %x[width], #1, 4f\n"
+ "ld1 { v27.s }[2], [x27], #0x4\n" // elements 4..5 go into 32-bit lane 2
+ "ld1 { v24.s }[2], [x26], #0x4\n"
+ "ld1 { v25.s }[2], [x25], #0x4\n"
+ "ld1 { v21.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "ld1 { v18.s }[2], [x22], #0x4\n"
+ "ld1 { v19.s }[2], [x21], #0x4\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "mov x19, #0x2\n" // x19 = number of 4-element blocks to emit (6 or 7 elements -> 2 blocks)
+ "tbz %x[width], #0, 7f\n"
+ "ld1 { v27.h }[6], [x27]\n" // element 6 goes into 16-bit lane 6; block count stays at 2
+ "ld1 { v24.h }[6], [x26]\n"
+ "ld1 { v25.h }[6], [x25]\n"
+ "ld1 { v21.h }[6], [x24]\n"
+ "ld1 { v22.h }[6], [x23]\n"
+ "ld1 { v18.h }[6], [x22]\n"
+ "ld1 { v19.h }[6], [x21]\n"
+ "ld1 { v16.h }[6], [x20]\n"
+ "b 7f\n"
+ "4:" // odd_loads_1_4
+ "mov x19, #0x1\n" // 4 elements -> 1 block
+ "tbz %x[width], #0, 7f\n"
+ "ld1 { v27.h }[4], [x27]\n"
+ "ld1 { v24.h }[4], [x26]\n"
+ "ld1 { v25.h }[4], [x25]\n"
+ "ld1 { v21.h }[4], [x24]\n"
+ "ld1 { v22.h }[4], [x23]\n"
+ "ld1 { v18.h }[4], [x22]\n"
+ "ld1 { v19.h }[4], [x21]\n"
+ "ld1 { v16.h }[4], [x20]\n"
+ "mov x19, #0x2\n" // 5 elements -> 2 blocks
+ "b 7f\n"
+ "5:" // odd_loads_2_0
+ "tbz %x[width], #1, 6f\n"
+ "ldr s27, [x27], #0x4\n" // 2 elements per row
+ "ldr s24, [x26], #0x4\n"
+ "ldr s25, [x25], #0x4\n"
+ "ldr s21, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "ldr s18, [x22], #0x4\n"
+ "ldr s19, [x21], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "mov x19, #0x1\n" // 2 or 3 elements -> 1 block
+ "tbz %x[width], #0, 7f\n"
+ "ld1 { v27.h }[2], [x27]\n"
+ "ld1 { v24.h }[2], [x26]\n"
+ "ld1 { v25.h }[2], [x25]\n"
+ "ld1 { v21.h }[2], [x24]\n"
+ "ld1 { v22.h }[2], [x23]\n"
+ "ld1 { v18.h }[2], [x22]\n"
+ "ld1 { v19.h }[2], [x21]\n"
+ "ld1 { v16.h }[2], [x20]\n"
+ "b 7f\n"
+ "6:" // odd_loads_1_0
+ "ldr h27, [x27, #0x0]\n" // a single element per row
+ "ldr h24, [x26, #0x0]\n"
+ "ldr h25, [x25, #0x0]\n"
+ "ldr h21, [x24, #0x0]\n"
+ "ldr h22, [x23, #0x0]\n"
+ "ldr h18, [x22, #0x0]\n"
+ "ldr h19, [x21, #0x0]\n"
+ "ldr h16, [x20, #0x0]\n"
+ "mov x19, #0x1\n"
+ "7:" // Odd load end
+ "zip1 v26.2d, v27.2d, v24.2d\n" // same zips as the main loop, but emitting only x19 blocks of 0x40 bytes
+ "subs x19, x19, #0x1\n" // flags are consumed by "beq 8f" below
+ "zip1 v23.2d, v25.2d, v21.2d\n"
+ "str q26, [%x[out_ptr], #0x0]\n"
+ "zip1 v20.2d, v22.2d, v18.2d\n"
+ "str q23, [%x[out_ptr], #0x10]\n"
+ "zip1 v17.2d, v19.2d, v16.2d\n"
+ "str q20, [%x[out_ptr], #0x20]\n"
+ "str q17, [%x[out_ptr], #0x30]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x40\n"
+ "beq 8f\n"
+ "zip2 v24.2d, v27.2d, v24.2d\n"
+ "zip2 v21.2d, v25.2d, v21.2d\n"
+ "str q24, [%x[out_ptr], #0x0]\n"
+ "zip2 v18.2d, v22.2d, v18.2d\n"
+ "str q21, [%x[out_ptr], #0x10]\n"
+ "zip2 v16.2d, v19.2d, v16.2d\n"
+ "str q18, [%x[out_ptr], #0x20]\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x40\n"
+ "8:" // Odds skip
+
+ : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // both are modified in place: the write cursor and the local copy of width
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp
new file mode 100644
index 0000000000..a9034f5742
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8.hpp
@@ -0,0 +1,343 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<>
+void interleave_block<8, 4, VLType::None, false>( // int8_t: interleaves 8 rows in blocks of 4 bytes (one 32-bit lane per block)
+ int8_t * &out_ptr, const int8_t * const * in, size_t width, size_t height, // out_ptr is advanced past the data written; in[] supplies the row base pointers
+ size_t row_offset, bool // row_offset is counted in elements (bytes here); the trailing bool is unnamed and unused
+)
+{
+ __asm__ __volatile__(
+ "ldr x27, [%x[in], #0x0]\n" // x27..x20 receive in[0]..in[7]. NOTE(review): all eight entries are read even when height < 8 - confirm callers always provide 8
+ "cmp %x[height], #0x8\n" // flags are consumed by "beq 1f" below (the ldr/add in between do not set flags)
+ "ldr x26, [%x[in], #0x8]\n"
+ "add x27, x27, %x[row_offset]\n" // no shift: elements are single bytes
+ "ldr x25, [%x[in], #0x10]\n"
+ "ldr x24, [%x[in], #0x18]\n"
+ "add x26, x26, %x[row_offset]\n"
+ "ldr x23, [%x[in], #0x20]\n"
+ "add x25, x25, %x[row_offset]\n"
+ "ldr x22, [%x[in], #0x28]\n"
+ "ldr x21, [%x[in], #0x30]\n"
+ "add x24, x24, %x[row_offset]\n"
+ "ldr x20, [%x[in], #0x38]\n"
+ "add x23, x23, %x[row_offset]\n"
+ "add x22, x22, %x[row_offset]\n"
+ "add x21, x21, %x[row_offset]\n"
+ "add x20, x20, %x[row_offset]\n"
+ "beq 1f\n" // height == 8: keep all eight pointers as loaded
+ "mov x20, x27\n" // otherwise row 7 is never valid: point it at row 0
+ "cmp %x[height], #0x2\n" // each cmp/csel pair below redirects rows with index >= height to row 0's pointer
+ "csel x26, x26, x27, GE\n"
+ "csel x25, x25, x27, GT\n"
+ "cmp %x[height], #0x4\n"
+ "csel x24, x24, x27, GE\n"
+ "csel x23, x23, x27, GT\n"
+ "cmp %x[height], #0x6\n"
+ "csel x22, x22, x27, GE\n"
+ "csel x21, x21, x27, GT\n" // redirected rows are still interleaved, so their output slots repeat row 0's data
+ "1:" // no_pointer_adj
+ "prfm pldl1keep, [x27, #0x0]\n"
+ "cmp %x[width], #0x10\n" // flags are consumed by "blt 3f" after the prefetches
+ "prfm pldl1keep, [x26, #0x0]\n"
+ "prfm pldl1keep, [x25, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x0]\n"
+ "prfm pldl1keep, [x23, #0x0]\n"
+ "prfm pldl1keep, [x22, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x0]\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x40]\n"
+ "prfm pldl1keep, [x26, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x40]\n"
+ "prfm pldl1keep, [x22, #0x40]\n"
+ "prfm pldl1keep, [x21, #0x40]\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "blt 3f\n" // fewer than 16 bytes per row: go straight to the remainder handling
+ "2:" // Main loop head
+ "ldr q28, [x27], #0x10\n" // one iteration consumes 16 bytes from each of the 8 rows
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "ldr q27, [x26], #0x10\n"
+ "ldr q26, [x25], #0x10\n"
+ "zip1 v23.4s, v28.4s, v26.4s\n" // two rounds of 32-bit zips gather lane k of rows 0..3 (and of rows 4..7) into one vector
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "ldr q22, [x24], #0x10\n"
+ "zip2 v26.4s, v28.4s, v26.4s\n"
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "ldr q25, [x23], #0x10\n"
+ "zip1 v20.4s, v27.4s, v22.4s\n"
+ "prfm pldl1keep, [x24, #0x70]\n"
+ "ldr q24, [x22], #0x10\n"
+ "zip1 v16.4s, v23.4s, v20.4s\n"
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "ldr q19, [x21], #0x10\n"
+ "zip2 v23.4s, v23.4s, v20.4s\n"
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "zip2 v22.4s, v27.4s, v22.4s\n"
+ "ldr q21, [x20], #0x10\n"
+ "zip1 v18.4s, v25.4s, v19.4s\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // block 0, rows 0..3
+ "zip1 v20.4s, v26.4s, v22.4s\n"
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "zip1 v16.4s, v24.4s, v21.4s\n"
+ "subs %x[width], %x[width], #0x10\n"
+ "zip1 v17.4s, v18.4s, v16.4s\n"
+ "cmp %x[width], #0x10\n" // this cmp (not the subs) provides the flags for "bge 2b"
+ "zip2 v16.4s, v18.4s, v16.4s\n"
+ "str q17, [%x[out_ptr], #0x10]\n" // block 0, rows 4..7
+ "zip2 v19.4s, v25.4s, v19.4s\n"
+ "str q23, [%x[out_ptr], #0x20]\n" // block 1, rows 0..3
+ "zip2 v18.4s, v24.4s, v21.4s\n"
+ "str q16, [%x[out_ptr], #0x30]\n" // block 1, rows 4..7
+ "zip1 v16.4s, v19.4s, v18.4s\n"
+ "str q20, [%x[out_ptr], #0x40]\n" // block 2, rows 0..3
+ "zip2 v17.4s, v26.4s, v22.4s\n"
+ "str q16, [%x[out_ptr], #0x50]\n" // block 2, rows 4..7
+ "zip2 v16.4s, v19.4s, v18.4s\n"
+ "str q17, [%x[out_ptr], #0x60]\n" // block 3, rows 0..3
+ "str q16, [%x[out_ptr], #0x70]\n" // block 3, rows 4..7
+ "add %x[out_ptr], %x[out_ptr], #0x80\n" // 8 stores of 16 bytes per iteration
+ "bge 2b\n"
+ "3:" // Main loop skip
+ "cbz %x[width], 12f\n" // no remainder: done
+ "tbz %x[width], #3, 7f\n" // remainder (1..15) is decoded bit by bit; bit 3 clear means fewer than 8 left
+ "ldr d28, [x27], #0x8\n" // 8 bytes per row; scalar SIMD loads zero the rest of the register (A64 behaviour)
+ "ldr d27, [x26], #0x8\n"
+ "ldr d26, [x25], #0x8\n"
+ "ldr d22, [x24], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "ldr d24, [x22], #0x8\n"
+ "ldr d19, [x21], #0x8\n"
+ "ldr d21, [x20], #0x8\n"
+ "tbz %x[width], #2, 5f\n"
+ "ld1 { v28.s }[2], [x27], #0x4\n" // bytes 8..11 go into 32-bit lane 2
+ "ld1 { v27.s }[2], [x26], #0x4\n"
+ "ld1 { v26.s }[2], [x25], #0x4\n"
+ "ld1 { v22.s }[2], [x24], #0x4\n"
+ "ld1 { v25.s }[2], [x23], #0x4\n"
+ "ld1 { v24.s }[2], [x22], #0x4\n"
+ "ld1 { v19.s }[2], [x21], #0x4\n"
+ "ld1 { v21.s }[2], [x20], #0x4\n"
+ "tbz %x[width], #1, 4f\n"
+ "ld1 { v28.h }[6], [x27], #0x2\n" // bytes 12..13 go into 16-bit lane 6
+ "ld1 { v27.h }[6], [x26], #0x2\n"
+ "ld1 { v26.h }[6], [x25], #0x2\n"
+ "ld1 { v22.h }[6], [x24], #0x2\n"
+ "ld1 { v25.h }[6], [x23], #0x2\n"
+ "ld1 { v24.h }[6], [x22], #0x2\n"
+ "ld1 { v19.h }[6], [x21], #0x2\n"
+ "ld1 { v21.h }[6], [x20], #0x2\n"
+ "mov x19, #0x4\n" // x19 = number of 4-byte blocks to emit (14 or 15 bytes -> 4 blocks)
+ "tbz %x[width], #0, 11f\n"
+ "ld1 { v28.b }[14], [x27]\n" // byte 14; block count stays at 4
+ "ld1 { v27.b }[14], [x26]\n"
+ "ld1 { v26.b }[14], [x25]\n"
+ "ld1 { v22.b }[14], [x24]\n"
+ "ld1 { v25.b }[14], [x23]\n"
+ "ld1 { v24.b }[14], [x22]\n"
+ "ld1 { v19.b }[14], [x21]\n"
+ "ld1 { v21.b }[14], [x20]\n"
+ "b 11f\n"
+ "4:" // odd_loads_1_12
+ "mov x19, #0x3\n" // 12 bytes -> 3 blocks
+ "tbz %x[width], #0, 11f\n"
+ "ld1 { v28.b }[12], [x27]\n"
+ "ld1 { v27.b }[12], [x26]\n"
+ "ld1 { v26.b }[12], [x25]\n"
+ "ld1 { v22.b }[12], [x24]\n"
+ "ld1 { v25.b }[12], [x23]\n"
+ "ld1 { v24.b }[12], [x22]\n"
+ "ld1 { v19.b }[12], [x21]\n"
+ "ld1 { v21.b }[12], [x20]\n"
+ "mov x19, #0x4\n" // 13 bytes -> 4 blocks
+ "b 11f\n"
+ "5:" // odd_loads_2_8
+ "tbz %x[width], #1, 6f\n"
+ "ld1 { v28.h }[4], [x27], #0x2\n" // bytes 8..9 go into 16-bit lane 4
+ "ld1 { v27.h }[4], [x26], #0x2\n"
+ "ld1 { v26.h }[4], [x25], #0x2\n"
+ "ld1 { v22.h }[4], [x24], #0x2\n"
+ "ld1 { v25.h }[4], [x23], #0x2\n"
+ "ld1 { v24.h }[4], [x22], #0x2\n"
+ "ld1 { v19.h }[4], [x21], #0x2\n"
+ "ld1 { v21.h }[4], [x20], #0x2\n"
+ "mov x19, #0x3\n" // 10 or 11 bytes -> 3 blocks
+ "tbz %x[width], #0, 11f\n"
+ "ld1 { v28.b }[10], [x27]\n"
+ "ld1 { v27.b }[10], [x26]\n"
+ "ld1 { v26.b }[10], [x25]\n"
+ "ld1 { v22.b }[10], [x24]\n"
+ "ld1 { v25.b }[10], [x23]\n"
+ "ld1 { v24.b }[10], [x22]\n"
+ "ld1 { v19.b }[10], [x21]\n"
+ "ld1 { v21.b }[10], [x20]\n"
+ "b 11f\n"
+ "6:" // odd_loads_1_8
+ "mov x19, #0x2\n" // 8 bytes -> 2 blocks
+ "tbz %x[width], #0, 11f\n"
+ "ld1 { v28.b }[8], [x27]\n"
+ "ld1 { v27.b }[8], [x26]\n"
+ "ld1 { v26.b }[8], [x25]\n"
+ "ld1 { v22.b }[8], [x24]\n"
+ "ld1 { v25.b }[8], [x23]\n"
+ "ld1 { v24.b }[8], [x22]\n"
+ "ld1 { v19.b }[8], [x21]\n"
+ "ld1 { v21.b }[8], [x20]\n"
+ "mov x19, #0x3\n" // 9 bytes -> 3 blocks
+ "b 11f\n"
+ "7:" // odd_loads_4_0
+ "tbz %x[width], #2, 9f\n"
+ "ldr s28, [x27], #0x4\n" // 4 bytes per row
+ "ldr s27, [x26], #0x4\n"
+ "ldr s26, [x25], #0x4\n"
+ "ldr s22, [x24], #0x4\n"
+ "ldr s25, [x23], #0x4\n"
+ "ldr s24, [x22], #0x4\n"
+ "ldr s19, [x21], #0x4\n"
+ "ldr s21, [x20], #0x4\n"
+ "tbz %x[width], #1, 8f\n"
+ "ld1 { v28.h }[2], [x27], #0x2\n" // bytes 4..5 go into 16-bit lane 2
+ "ld1 { v27.h }[2], [x26], #0x2\n"
+ "ld1 { v26.h }[2], [x25], #0x2\n"
+ "ld1 { v22.h }[2], [x24], #0x2\n"
+ "ld1 { v25.h }[2], [x23], #0x2\n"
+ "ld1 { v24.h }[2], [x22], #0x2\n"
+ "ld1 { v19.h }[2], [x21], #0x2\n"
+ "ld1 { v21.h }[2], [x20], #0x2\n"
+ "mov x19, #0x2\n" // 6 or 7 bytes -> 2 blocks
+ "tbz %x[width], #0, 11f\n"
+ "ld1 { v28.b }[6], [x27]\n"
+ "ld1 { v27.b }[6], [x26]\n"
+ "ld1 { v26.b }[6], [x25]\n"
+ "ld1 { v22.b }[6], [x24]\n"
+ "ld1 { v25.b }[6], [x23]\n"
+ "ld1 { v24.b }[6], [x22]\n"
+ "ld1 { v19.b }[6], [x21]\n"
+ "ld1 { v21.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // odd_loads_1_4
+ "mov x19, #0x1\n" // 4 bytes -> 1 block
+ "tbz %x[width], #0, 11f\n"
+ "ld1 { v28.b }[4], [x27]\n"
+ "ld1 { v27.b }[4], [x26]\n"
+ "ld1 { v26.b }[4], [x25]\n"
+ "ld1 { v22.b }[4], [x24]\n"
+ "ld1 { v25.b }[4], [x23]\n"
+ "ld1 { v24.b }[4], [x22]\n"
+ "ld1 { v19.b }[4], [x21]\n"
+ "ld1 { v21.b }[4], [x20]\n"
+ "mov x19, #0x2\n" // 5 bytes -> 2 blocks
+ "b 11f\n"
+ "9:" // odd_loads_2_0
+ "tbz %x[width], #1, 10f\n"
+ "ldr h28, [x27], #0x2\n" // 2 bytes per row
+ "ldr h27, [x26], #0x2\n"
+ "ldr h26, [x25], #0x2\n"
+ "ldr h22, [x24], #0x2\n"
+ "ldr h25, [x23], #0x2\n"
+ "ldr h24, [x22], #0x2\n"
+ "ldr h19, [x21], #0x2\n"
+ "ldr h21, [x20], #0x2\n"
+ "mov x19, #0x1\n" // 2 or 3 bytes -> 1 block
+ "tbz %x[width], #0, 11f\n"
+ "ld1 { v28.b }[2], [x27]\n"
+ "ld1 { v27.b }[2], [x26]\n"
+ "ld1 { v26.b }[2], [x25]\n"
+ "ld1 { v22.b }[2], [x24]\n"
+ "ld1 { v25.b }[2], [x23]\n"
+ "ld1 { v24.b }[2], [x22]\n"
+ "ld1 { v19.b }[2], [x21]\n"
+ "ld1 { v21.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // odd_loads_1_0
+ "ldr b28, [x27, #0x0]\n" // a single byte per row
+ "ldr b27, [x26, #0x0]\n"
+ "ldr b26, [x25, #0x0]\n"
+ "ldr b22, [x24, #0x0]\n"
+ "ldr b25, [x23, #0x0]\n"
+ "ldr b24, [x22, #0x0]\n"
+ "ldr b19, [x21, #0x0]\n"
+ "ldr b21, [x20, #0x0]\n"
+ "mov x19, #0x1\n"
+ "11:" // Odd load end
+ "zip1 v23.4s, v28.4s, v26.4s\n" // same zip network as the main loop, but emitting only x19 blocks of 0x20 bytes
+ "subs x19, x19, #0x1\n" // flags are consumed by the next "beq 12f"
+ "zip1 v20.4s, v27.4s, v22.4s\n"
+ "zip1 v16.4s, v23.4s, v20.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v18.4s, v25.4s, v19.4s\n"
+ "zip1 v16.4s, v24.4s, v21.4s\n"
+ "zip1 v17.4s, v18.4s, v16.4s\n"
+ "str q17, [%x[out_ptr], #0x10]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "beq 12f\n"
+ "zip2 v23.4s, v23.4s, v20.4s\n"
+ "zip2 v16.4s, v18.4s, v16.4s\n"
+ "str q23, [%x[out_ptr], #0x0]\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "subs x19, x19, #0x1\n"
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "beq 12f\n"
+ "zip2 v26.4s, v28.4s, v26.4s\n"
+ "zip2 v22.4s, v27.4s, v22.4s\n"
+ "subs x19, x19, #0x1\n"
+ "zip1 v20.4s, v26.4s, v22.4s\n"
+ "str q20, [%x[out_ptr], #0x0]\n"
+ "zip2 v19.4s, v25.4s, v19.4s\n"
+ "zip2 v18.4s, v24.4s, v21.4s\n"
+ "zip1 v16.4s, v19.4s, v18.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "beq 12f\n"
+ "zip2 v17.4s, v26.4s, v22.4s\n"
+ "zip2 v16.4s, v19.4s, v18.4s\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "12:" // Odds skip
+
+ : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // both are modified in place: the write cursor and the local copy of width
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ );
+}
+
+template<>
+void interleave_block<8, 4, VLType::None, false>( // uint8_t variant: forwards to the int8_t specialisation, which only loads, zips and stores bytes
+ uint8_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height, // out_ptr is advanced past the data written, exactly as in the int8_t kernel
+ size_t row_offset, bool // the trailing bool is unnamed and unused; 'false' is forwarded
+)
+{
+ int8_t * out_cast = reinterpret_cast<int8_t *>(out_ptr); // convert the pointer VALUE; binding an int8_t*& to the uint8_t* object would access it through a non-similar type (undefined behaviour)
+ const int8_t * const * in_cast = reinterpret_cast<const int8_t * const *>(in); // the row-pointer table is only read by the kernel's assembly
+ interleave_block<8, 4, VLType::None, false>(out_cast, in_cast, width, height, row_offset, false);
+ out_ptr = reinterpret_cast<uint8_t *>(out_cast); // hand the advanced write cursor back to the caller
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp
new file mode 100644
index 0000000000..2831cb79a6
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_s8_s8_summing.hpp
@@ -0,0 +1,370 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<> // int8_t specialization: interleaves 8 input rows in 4-byte blocks and appends eight 32-bit per-row sums.
+void interleave_block<8, 4, VLType::None, true>(
+ int8_t * &out_ptr, const int8_t * const * in, size_t width, size_t height, // out_ptr: output cursor, advanced by the asm; in: table of 8 row pointers; width: bytes per row
+ size_t row_offset, bool first // row_offset: added to every row pointer; first: when zero, sums are reloaded from the 0x20 bytes below out_ptr
+)
+{
+ __asm__ __volatile__(
+ "movi v1.8h, #0x0\n" // v1/v0: 16-bit partial sums (v1 is fed from the rows 0-3 output vectors, v0 from rows 4-7)
+ "ldr x27, [%x[in], #0x0]\n" // x27..x20 <- in[0]..in[7]; row_offset is added to each one below
+ "mov x19, #0x0\n" // x19: main-loop iterations since the 16-bit sums were last folded
+ "movi v0.8h, #0x0\n"
+ "ldr x26, [%x[in], #0x8]\n"
+ "cmp %x[height], #0x8\n" // flags are consumed by "beq 1f" below; nothing in between sets flags
+ "movi v31.4s, #0x0\n" // v31/v30: 32-bit running sums, stored after the interleaved data at label 14
+ "ldr x25, [%x[in], #0x10]\n"
+ "add x27, x27, %x[row_offset]\n"
+ "movi v30.4s, #0x0\n"
+ "ldr x24, [%x[in], #0x18]\n"
+ "ldr x23, [%x[in], #0x20]\n"
+ "add x26, x26, %x[row_offset]\n"
+ "ldr x22, [%x[in], #0x28]\n"
+ "add x25, x25, %x[row_offset]\n"
+ "ldr x21, [%x[in], #0x30]\n"
+ "add x24, x24, %x[row_offset]\n"
+ "ldr x20, [%x[in], #0x38]\n"
+ "add x23, x23, %x[row_offset]\n"
+ "add x22, x22, %x[row_offset]\n"
+ "add x21, x21, %x[row_offset]\n"
+ "add x20, x20, %x[row_offset]\n"
+ "beq 1f\n" // height == 8: use all eight row pointers as loaded
+ "mov x20, x27\n" // otherwise row 7 reads row 0's data. NOTE(review): also taken for height > 8 - presumably never passed; confirm with callers
+ "cmp %x[height], #0x2\n" // each csel below keeps row k's pointer only if k < height, else substitutes row 0 (x27)
+ "csel x26, x26, x27, GE\n"
+ "csel x25, x25, x27, GT\n"
+ "cmp %x[height], #0x4\n"
+ "csel x24, x24, x27, GE\n"
+ "csel x23, x23, x27, GT\n"
+ "cmp %x[height], #0x6\n"
+ "csel x22, x22, x27, GE\n"
+ "csel x21, x21, x27, GT\n"
+ "1:" // no_pointer_adj
+ "prfm pldl1keep, [x27, #0x0]\n" // prefetch offsets 0x0 and 0x40 of every row
+ "prfm pldl1keep, [x26, #0x0]\n"
+ "prfm pldl1keep, [x25, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x0]\n"
+ "prfm pldl1keep, [x23, #0x0]\n"
+ "prfm pldl1keep, [x22, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x0]\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x40]\n"
+ "prfm pldl1keep, [x26, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x40]\n"
+ "prfm pldl1keep, [x22, #0x40]\n"
+ "prfm pldl1keep, [x21, #0x40]\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "cbnz %w[first], 2f\n" // first != 0: keep the zeroed sums and skip the reload
+ "sub %x[out_ptr], %x[out_ptr], #0x20\n" // step back over 0x20 bytes of sums (presumably those stored at label 14 by the preceding call); new data overwrites them
+ "ld1 { v31.4s }, [%x[out_ptr]]\n" // v31 <- stored sums, first four 32-bit values
+ "ldr q30, [%x[out_ptr], #0x10]\n" // v30 <- stored sums, last four 32-bit values
+ "2:" // first_pass
+ "cmp %x[width], #0x10\n"
+ "blt 5f\n" // fewer than 16 bytes per row: go straight to the tail
+ "3:" // Main loop head
+ "cmp x19, #0x1e\n" // after more than 30 iterations, fold v1/v0 into v31/v30 and reset (presumably so the 16-bit lanes cannot overflow)
+ "ble 4f\n"
+ "sadalp v31.4s, v1.8h\n"
+ "movi v1.8h, #0x0\n"
+ "sadalp v30.4s, v0.8h\n"
+ "movi v0.8h, #0x0\n"
+ "mov x19, #0x0\n"
+ "4:" // no_accumulate_16
+ "ldr q29, [x27], #0x10\n" // 16 bytes per row; rows 0..7 land in v29, v28, v27, v21, v26, v25, v19, v22
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "ldr q28, [x26], #0x10\n"
+ "ldr q27, [x25], #0x10\n"
+ "zip1 v23.4s, v29.4s, v27.4s\n" // two levels of 32-bit zips gather one 4-byte block from four rows into each output vector
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "ldr q21, [x24], #0x10\n"
+ "zip2 v27.4s, v29.4s, v27.4s\n"
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "ldr q26, [x23], #0x10\n"
+ "zip1 v20.4s, v28.4s, v21.4s\n"
+ "prfm pldl1keep, [x24, #0x70]\n"
+ "ldr q25, [x22], #0x10\n"
+ "zip1 v16.4s, v23.4s, v20.4s\n"
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "ldr q19, [x21], #0x10\n"
+ "zip2 v24.4s, v23.4s, v20.4s\n"
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "zip2 v23.4s, v28.4s, v21.4s\n"
+ "ldr q22, [x20], #0x10\n"
+ "zip1 v18.4s, v26.4s, v19.4s\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // block 0, rows 0-3
+ "zip1 v21.4s, v27.4s, v23.4s\n"
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "zip1 v17.4s, v25.4s, v22.4s\n"
+ "sadalp v1.8h, v16.16b\n" // add adjacent signed byte pairs of the stored vector into v1's 16-bit lanes
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "add x19, x19, #0x1\n"
+ "zip2 v20.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n" // block 0, rows 4-7
+ "zip2 v19.4s, v26.4s, v19.4s\n"
+ "sadalp v0.8h, v16.16b\n"
+ "zip2 v16.4s, v25.4s, v22.4s\n"
+ "str q24, [%x[out_ptr], #0x20]\n" // block 1, rows 0-3
+ "zip1 v18.4s, v19.4s, v16.4s\n"
+ "sadalp v1.8h, v24.16b\n"
+ "zip2 v17.4s, v27.4s, v23.4s\n"
+ "str q20, [%x[out_ptr], #0x30]\n" // block 1, rows 4-7
+ "zip2 v16.4s, v19.4s, v16.4s\n"
+ "str q21, [%x[out_ptr], #0x40]\n" // block 2, rows 0-3
+ "str q18, [%x[out_ptr], #0x50]\n" // block 2, rows 4-7
+ "sadalp v0.8h, v20.16b\n"
+ "str q17, [%x[out_ptr], #0x60]\n" // block 3, rows 0-3
+ "sadalp v1.8h, v21.16b\n"
+ "str q16, [%x[out_ptr], #0x70]\n" // block 3, rows 4-7
+ "subs %x[width], %x[width], #0x10\n" // 16 bytes per row consumed
+ "sadalp v0.8h, v18.16b\n"
+ "cmp %x[width], #0x10\n"
+ "sadalp v1.8h, v17.16b\n"
+ "add %x[out_ptr], %x[out_ptr], #0x80\n"
+ "sadalp v0.8h, v16.16b\n"
+ "bge 3b\n"
+ "5:" // Main loop skip
+ "cbz %x[width], 14f\n" // nothing left: only the sums remain to be written
+ "tbz %x[width], #3, 9f\n" // tail: bits 3..0 of width select 8/4/2/1-byte loads; x19 is set to the number of 4-byte blocks to emit
+ "ldr d29, [x27], #0x8\n" // NOTE(review): relies on scalar-form ldr clearing the rest of the vector so unread bytes count as 0 - confirm against the Arm ARM
+ "ldr d28, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d19, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
+ "tbz %x[width], #2, 7f\n"
+ "ld1 { v29.s }[2], [x27], #0x4\n"
+ "ld1 { v28.s }[2], [x26], #0x4\n"
+ "ld1 { v27.s }[2], [x25], #0x4\n"
+ "ld1 { v21.s }[2], [x24], #0x4\n"
+ "ld1 { v26.s }[2], [x23], #0x4\n"
+ "ld1 { v25.s }[2], [x22], #0x4\n"
+ "ld1 { v19.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
+ "tbz %x[width], #1, 6f\n"
+ "ld1 { v29.h }[6], [x27], #0x2\n"
+ "ld1 { v28.h }[6], [x26], #0x2\n"
+ "ld1 { v27.h }[6], [x25], #0x2\n"
+ "ld1 { v21.h }[6], [x24], #0x2\n"
+ "ld1 { v26.h }[6], [x23], #0x2\n"
+ "ld1 { v25.h }[6], [x22], #0x2\n"
+ "ld1 { v19.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
+ "mov x19, #0x4\n" // 14 or 15 bytes -> 4 blocks
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[14], [x27]\n"
+ "ld1 { v28.b }[14], [x26]\n"
+ "ld1 { v27.b }[14], [x25]\n"
+ "ld1 { v21.b }[14], [x24]\n"
+ "ld1 { v26.b }[14], [x23]\n"
+ "ld1 { v25.b }[14], [x22]\n"
+ "ld1 { v19.b }[14], [x21]\n"
+ "ld1 { v22.b }[14], [x20]\n"
+ "b 13f\n"
+ "6:" // odd_loads_1_12
+ "mov x19, #0x3\n" // 12 bytes -> 3 blocks; raised to 4 below when a 13th byte is loaded
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[12], [x27]\n"
+ "ld1 { v28.b }[12], [x26]\n"
+ "ld1 { v27.b }[12], [x25]\n"
+ "ld1 { v21.b }[12], [x24]\n"
+ "ld1 { v26.b }[12], [x23]\n"
+ "ld1 { v25.b }[12], [x22]\n"
+ "ld1 { v19.b }[12], [x21]\n"
+ "ld1 { v22.b }[12], [x20]\n"
+ "mov x19, #0x4\n"
+ "b 13f\n"
+ "7:" // odd_loads_2_8
+ "tbz %x[width], #1, 8f\n"
+ "ld1 { v29.h }[4], [x27], #0x2\n"
+ "ld1 { v28.h }[4], [x26], #0x2\n"
+ "ld1 { v27.h }[4], [x25], #0x2\n"
+ "ld1 { v21.h }[4], [x24], #0x2\n"
+ "ld1 { v26.h }[4], [x23], #0x2\n"
+ "ld1 { v25.h }[4], [x22], #0x2\n"
+ "ld1 { v19.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
+ "mov x19, #0x3\n" // 10 or 11 bytes -> 3 blocks
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[10], [x27]\n"
+ "ld1 { v28.b }[10], [x26]\n"
+ "ld1 { v27.b }[10], [x25]\n"
+ "ld1 { v21.b }[10], [x24]\n"
+ "ld1 { v26.b }[10], [x23]\n"
+ "ld1 { v25.b }[10], [x22]\n"
+ "ld1 { v19.b }[10], [x21]\n"
+ "ld1 { v22.b }[10], [x20]\n"
+ "b 13f\n"
+ "8:" // odd_loads_1_8
+ "mov x19, #0x2\n" // 8 bytes -> 2 blocks; raised to 3 below when a 9th byte is loaded
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[8], [x27]\n"
+ "ld1 { v28.b }[8], [x26]\n"
+ "ld1 { v27.b }[8], [x25]\n"
+ "ld1 { v21.b }[8], [x24]\n"
+ "ld1 { v26.b }[8], [x23]\n"
+ "ld1 { v25.b }[8], [x22]\n"
+ "ld1 { v19.b }[8], [x21]\n"
+ "ld1 { v22.b }[8], [x20]\n"
+ "mov x19, #0x3\n"
+ "b 13f\n"
+ "9:" // odd_loads_4_0
+ "tbz %x[width], #2, 11f\n"
+ "ldr s29, [x27], #0x4\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s27, [x25], #0x4\n"
+ "ldr s21, [x24], #0x4\n"
+ "ldr s26, [x23], #0x4\n"
+ "ldr s25, [x22], #0x4\n"
+ "ldr s19, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
+ "tbz %x[width], #1, 10f\n"
+ "ld1 { v29.h }[2], [x27], #0x2\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "ld1 { v27.h }[2], [x25], #0x2\n"
+ "ld1 { v21.h }[2], [x24], #0x2\n"
+ "ld1 { v26.h }[2], [x23], #0x2\n"
+ "ld1 { v25.h }[2], [x22], #0x2\n"
+ "ld1 { v19.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
+ "mov x19, #0x2\n" // 6 or 7 bytes -> 2 blocks
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[6], [x27]\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "ld1 { v27.b }[6], [x25]\n"
+ "ld1 { v21.b }[6], [x24]\n"
+ "ld1 { v26.b }[6], [x23]\n"
+ "ld1 { v25.b }[6], [x22]\n"
+ "ld1 { v19.b }[6], [x21]\n"
+ "ld1 { v22.b }[6], [x20]\n"
+ "b 13f\n"
+ "10:" // odd_loads_1_4
+ "mov x19, #0x1\n" // 4 bytes -> 1 block; raised to 2 below when a 5th byte is loaded
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[4], [x27]\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "ld1 { v27.b }[4], [x25]\n"
+ "ld1 { v21.b }[4], [x24]\n"
+ "ld1 { v26.b }[4], [x23]\n"
+ "ld1 { v25.b }[4], [x22]\n"
+ "ld1 { v19.b }[4], [x21]\n"
+ "ld1 { v22.b }[4], [x20]\n"
+ "mov x19, #0x2\n"
+ "b 13f\n"
+ "11:" // odd_loads_2_0
+ "tbz %x[width], #1, 12f\n"
+ "ldr h29, [x27], #0x2\n"
+ "ldr h28, [x26], #0x2\n"
+ "ldr h27, [x25], #0x2\n"
+ "ldr h21, [x24], #0x2\n"
+ "ldr h26, [x23], #0x2\n"
+ "ldr h25, [x22], #0x2\n"
+ "ldr h19, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
+ "mov x19, #0x1\n" // 2 or 3 bytes -> 1 block
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[2], [x27]\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "ld1 { v27.b }[2], [x25]\n"
+ "ld1 { v21.b }[2], [x24]\n"
+ "ld1 { v26.b }[2], [x23]\n"
+ "ld1 { v25.b }[2], [x22]\n"
+ "ld1 { v19.b }[2], [x21]\n"
+ "ld1 { v22.b }[2], [x20]\n"
+ "b 13f\n"
+ "12:" // odd_loads_1_0
+ "ldr b29, [x27, #0x0]\n"
+ "ldr b28, [x26, #0x0]\n"
+ "ldr b27, [x25, #0x0]\n"
+ "ldr b21, [x24, #0x0]\n"
+ "ldr b26, [x23, #0x0]\n"
+ "ldr b25, [x22, #0x0]\n"
+ "ldr b19, [x21, #0x0]\n"
+ "ldr b22, [x20, #0x0]\n"
+ "mov x19, #0x1\n" // 1 byte -> 1 block
+ "13:" // Odd load end
+ "zip1 v23.4s, v29.4s, v27.4s\n" // emit x19 blocks of 0x20 bytes each, with the same zip/sum pattern as the main loop
+ "subs x19, x19, #0x1\n" // flags are consumed by the next "beq 14f"; nothing in between sets flags
+ "zip1 v20.4s, v28.4s, v21.4s\n"
+ "zip1 v16.4s, v23.4s, v20.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v18.4s, v26.4s, v19.4s\n"
+ "sadalp v1.8h, v16.16b\n"
+ "zip1 v17.4s, v25.4s, v22.4s\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "sadalp v0.8h, v16.16b\n"
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "beq 14f\n"
+ "zip2 v24.4s, v23.4s, v20.4s\n"
+ "zip2 v20.4s, v18.4s, v17.4s\n"
+ "str q24, [%x[out_ptr], #0x0]\n"
+ "sadalp v1.8h, v24.16b\n"
+ "str q20, [%x[out_ptr], #0x10]\n"
+ "sadalp v0.8h, v20.16b\n"
+ "subs x19, x19, #0x1\n"
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "beq 14f\n"
+ "zip2 v27.4s, v29.4s, v27.4s\n"
+ "zip2 v23.4s, v28.4s, v21.4s\n"
+ "subs x19, x19, #0x1\n"
+ "zip1 v21.4s, v27.4s, v23.4s\n"
+ "str q21, [%x[out_ptr], #0x0]\n"
+ "zip2 v19.4s, v26.4s, v19.4s\n"
+ "sadalp v1.8h, v21.16b\n"
+ "zip2 v16.4s, v25.4s, v22.4s\n"
+ "zip1 v18.4s, v19.4s, v16.4s\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
+ "sadalp v0.8h, v18.16b\n"
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "beq 14f\n"
+ "zip2 v17.4s, v27.4s, v23.4s\n"
+ "zip2 v16.4s, v19.4s, v16.4s\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
+ "sadalp v1.8h, v17.16b\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "sadalp v0.8h, v16.16b\n"
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "14:" // Odds skip
+ "sadalp v31.4s, v1.8h\n" // final fold of the 16-bit partial sums into the 32-bit totals
+ "sadalp v30.4s, v0.8h\n"
+ "str q31, [%x[out_ptr], #0x0]\n" // append the eight 32-bit sums (0x20 bytes) after the interleaved data
+ "str q30, [%x[out_ptr], #0x10]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // read-write operands: both are modified by the asm
+ : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+ : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp
new file mode 100644
index 0000000000..7c7857bcd0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block4_u8_u8_summing.hpp
@@ -0,0 +1,370 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<> // uint8_t specialization: interleaves 8 input rows in 4-byte blocks and appends eight 32-bit per-row sums.
+void interleave_block<8, 4, VLType::None, true>(
+ uint8_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height, // out_ptr: output cursor, advanced by the asm; in: table of 8 row pointers; width: bytes per row
+ size_t row_offset, bool first // row_offset: added to every row pointer; first: when zero, sums are reloaded from the 0x20 bytes below out_ptr
+)
+{
+ __asm__ __volatile__(
+ "movi v1.8h, #0x0\n" // v1/v0: 16-bit partial sums (v1 is fed from the rows 0-3 output vectors, v0 from rows 4-7)
+ "ldr x27, [%x[in], #0x0]\n" // x27..x20 <- in[0]..in[7]; row_offset is added to each one below
+ "mov x19, #0x0\n" // x19: main-loop iterations since the 16-bit sums were last folded
+ "movi v0.8h, #0x0\n"
+ "ldr x26, [%x[in], #0x8]\n"
+ "cmp %x[height], #0x8\n" // flags are consumed by "beq 1f" below; nothing in between sets flags
+ "movi v31.4s, #0x0\n" // v31/v30: 32-bit running sums, stored after the interleaved data at label 14
+ "ldr x25, [%x[in], #0x10]\n"
+ "add x27, x27, %x[row_offset]\n"
+ "movi v30.4s, #0x0\n"
+ "ldr x24, [%x[in], #0x18]\n"
+ "ldr x23, [%x[in], #0x20]\n"
+ "add x26, x26, %x[row_offset]\n"
+ "ldr x22, [%x[in], #0x28]\n"
+ "add x25, x25, %x[row_offset]\n"
+ "ldr x21, [%x[in], #0x30]\n"
+ "add x24, x24, %x[row_offset]\n"
+ "ldr x20, [%x[in], #0x38]\n"
+ "add x23, x23, %x[row_offset]\n"
+ "add x22, x22, %x[row_offset]\n"
+ "add x21, x21, %x[row_offset]\n"
+ "add x20, x20, %x[row_offset]\n"
+ "beq 1f\n" // height == 8: use all eight row pointers as loaded
+ "mov x20, x27\n" // otherwise row 7 reads row 0's data. NOTE(review): also taken for height > 8 - presumably never passed; confirm with callers
+ "cmp %x[height], #0x2\n" // each csel below keeps row k's pointer only if k < height, else substitutes row 0 (x27)
+ "csel x26, x26, x27, GE\n"
+ "csel x25, x25, x27, GT\n"
+ "cmp %x[height], #0x4\n"
+ "csel x24, x24, x27, GE\n"
+ "csel x23, x23, x27, GT\n"
+ "cmp %x[height], #0x6\n"
+ "csel x22, x22, x27, GE\n"
+ "csel x21, x21, x27, GT\n"
+ "1:" // no_pointer_adj
+ "prfm pldl1keep, [x27, #0x0]\n" // prefetch offsets 0x0 and 0x40 of every row
+ "prfm pldl1keep, [x26, #0x0]\n"
+ "prfm pldl1keep, [x25, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x0]\n"
+ "prfm pldl1keep, [x23, #0x0]\n"
+ "prfm pldl1keep, [x22, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x0]\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x40]\n"
+ "prfm pldl1keep, [x26, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x40]\n"
+ "prfm pldl1keep, [x22, #0x40]\n"
+ "prfm pldl1keep, [x21, #0x40]\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "cbnz %w[first], 2f\n" // first != 0: keep the zeroed sums and skip the reload
+ "sub %x[out_ptr], %x[out_ptr], #0x20\n" // step back over 0x20 bytes of sums (presumably those stored at label 14 by the preceding call); new data overwrites them
+ "ld1 { v31.4s }, [%x[out_ptr]]\n" // v31 <- stored sums, first four 32-bit values
+ "ldr q30, [%x[out_ptr], #0x10]\n" // v30 <- stored sums, last four 32-bit values
+ "2:" // first_pass
+ "cmp %x[width], #0x10\n"
+ "blt 5f\n" // fewer than 16 bytes per row: go straight to the tail
+ "3:" // Main loop head
+ "cmp x19, #0x1e\n" // after more than 30 iterations, fold v1/v0 into v31/v30 and reset (presumably so the 16-bit lanes cannot overflow)
+ "ble 4f\n"
+ "uadalp v31.4s, v1.8h\n"
+ "movi v1.8h, #0x0\n"
+ "uadalp v30.4s, v0.8h\n"
+ "movi v0.8h, #0x0\n"
+ "mov x19, #0x0\n"
+ "4:" // no_accumulate_16
+ "ldr q29, [x27], #0x10\n" // 16 bytes per row; rows 0..7 land in v29, v28, v27, v21, v26, v25, v19, v22
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "ldr q28, [x26], #0x10\n"
+ "ldr q27, [x25], #0x10\n"
+ "zip1 v23.4s, v29.4s, v27.4s\n" // two levels of 32-bit zips gather one 4-byte block from four rows into each output vector
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "ldr q21, [x24], #0x10\n"
+ "zip2 v27.4s, v29.4s, v27.4s\n"
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "ldr q26, [x23], #0x10\n"
+ "zip1 v20.4s, v28.4s, v21.4s\n"
+ "prfm pldl1keep, [x24, #0x70]\n"
+ "ldr q25, [x22], #0x10\n"
+ "zip1 v16.4s, v23.4s, v20.4s\n"
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "ldr q19, [x21], #0x10\n"
+ "zip2 v24.4s, v23.4s, v20.4s\n"
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "zip2 v23.4s, v28.4s, v21.4s\n"
+ "ldr q22, [x20], #0x10\n"
+ "zip1 v18.4s, v26.4s, v19.4s\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "str q16, [%x[out_ptr], #0x0]\n" // block 0, rows 0-3
+ "zip1 v21.4s, v27.4s, v23.4s\n"
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "zip1 v17.4s, v25.4s, v22.4s\n"
+ "uadalp v1.8h, v16.16b\n" // add adjacent unsigned byte pairs of the stored vector into v1's 16-bit lanes
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "add x19, x19, #0x1\n"
+ "zip2 v20.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n" // block 0, rows 4-7
+ "zip2 v19.4s, v26.4s, v19.4s\n"
+ "uadalp v0.8h, v16.16b\n"
+ "zip2 v16.4s, v25.4s, v22.4s\n"
+ "str q24, [%x[out_ptr], #0x20]\n" // block 1, rows 0-3
+ "zip1 v18.4s, v19.4s, v16.4s\n"
+ "uadalp v1.8h, v24.16b\n"
+ "zip2 v17.4s, v27.4s, v23.4s\n"
+ "str q20, [%x[out_ptr], #0x30]\n" // block 1, rows 4-7
+ "zip2 v16.4s, v19.4s, v16.4s\n"
+ "str q21, [%x[out_ptr], #0x40]\n" // block 2, rows 0-3
+ "str q18, [%x[out_ptr], #0x50]\n" // block 2, rows 4-7
+ "uadalp v0.8h, v20.16b\n"
+ "str q17, [%x[out_ptr], #0x60]\n" // block 3, rows 0-3
+ "uadalp v1.8h, v21.16b\n"
+ "str q16, [%x[out_ptr], #0x70]\n" // block 3, rows 4-7
+ "subs %x[width], %x[width], #0x10\n" // 16 bytes per row consumed
+ "uadalp v0.8h, v18.16b\n"
+ "cmp %x[width], #0x10\n"
+ "uadalp v1.8h, v17.16b\n"
+ "add %x[out_ptr], %x[out_ptr], #0x80\n"
+ "uadalp v0.8h, v16.16b\n"
+ "bge 3b\n"
+ "5:" // Main loop skip
+ "cbz %x[width], 14f\n" // nothing left: only the sums remain to be written
+ "tbz %x[width], #3, 9f\n" // tail: bits 3..0 of width select 8/4/2/1-byte loads; x19 is set to the number of 4-byte blocks to emit
+ "ldr d29, [x27], #0x8\n" // NOTE(review): relies on scalar-form ldr clearing the rest of the vector so unread bytes count as 0 - confirm against the Arm ARM
+ "ldr d28, [x26], #0x8\n"
+ "ldr d27, [x25], #0x8\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
+ "ldr d25, [x22], #0x8\n"
+ "ldr d19, [x21], #0x8\n"
+ "ldr d22, [x20], #0x8\n"
+ "tbz %x[width], #2, 7f\n"
+ "ld1 { v29.s }[2], [x27], #0x4\n"
+ "ld1 { v28.s }[2], [x26], #0x4\n"
+ "ld1 { v27.s }[2], [x25], #0x4\n"
+ "ld1 { v21.s }[2], [x24], #0x4\n"
+ "ld1 { v26.s }[2], [x23], #0x4\n"
+ "ld1 { v25.s }[2], [x22], #0x4\n"
+ "ld1 { v19.s }[2], [x21], #0x4\n"
+ "ld1 { v22.s }[2], [x20], #0x4\n"
+ "tbz %x[width], #1, 6f\n"
+ "ld1 { v29.h }[6], [x27], #0x2\n"
+ "ld1 { v28.h }[6], [x26], #0x2\n"
+ "ld1 { v27.h }[6], [x25], #0x2\n"
+ "ld1 { v21.h }[6], [x24], #0x2\n"
+ "ld1 { v26.h }[6], [x23], #0x2\n"
+ "ld1 { v25.h }[6], [x22], #0x2\n"
+ "ld1 { v19.h }[6], [x21], #0x2\n"
+ "ld1 { v22.h }[6], [x20], #0x2\n"
+ "mov x19, #0x4\n" // 14 or 15 bytes -> 4 blocks
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[14], [x27]\n"
+ "ld1 { v28.b }[14], [x26]\n"
+ "ld1 { v27.b }[14], [x25]\n"
+ "ld1 { v21.b }[14], [x24]\n"
+ "ld1 { v26.b }[14], [x23]\n"
+ "ld1 { v25.b }[14], [x22]\n"
+ "ld1 { v19.b }[14], [x21]\n"
+ "ld1 { v22.b }[14], [x20]\n"
+ "b 13f\n"
+ "6:" // odd_loads_1_12
+ "mov x19, #0x3\n" // 12 bytes -> 3 blocks; raised to 4 below when a 13th byte is loaded
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[12], [x27]\n"
+ "ld1 { v28.b }[12], [x26]\n"
+ "ld1 { v27.b }[12], [x25]\n"
+ "ld1 { v21.b }[12], [x24]\n"
+ "ld1 { v26.b }[12], [x23]\n"
+ "ld1 { v25.b }[12], [x22]\n"
+ "ld1 { v19.b }[12], [x21]\n"
+ "ld1 { v22.b }[12], [x20]\n"
+ "mov x19, #0x4\n"
+ "b 13f\n"
+ "7:" // odd_loads_2_8
+ "tbz %x[width], #1, 8f\n"
+ "ld1 { v29.h }[4], [x27], #0x2\n"
+ "ld1 { v28.h }[4], [x26], #0x2\n"
+ "ld1 { v27.h }[4], [x25], #0x2\n"
+ "ld1 { v21.h }[4], [x24], #0x2\n"
+ "ld1 { v26.h }[4], [x23], #0x2\n"
+ "ld1 { v25.h }[4], [x22], #0x2\n"
+ "ld1 { v19.h }[4], [x21], #0x2\n"
+ "ld1 { v22.h }[4], [x20], #0x2\n"
+ "mov x19, #0x3\n" // 10 or 11 bytes -> 3 blocks
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[10], [x27]\n"
+ "ld1 { v28.b }[10], [x26]\n"
+ "ld1 { v27.b }[10], [x25]\n"
+ "ld1 { v21.b }[10], [x24]\n"
+ "ld1 { v26.b }[10], [x23]\n"
+ "ld1 { v25.b }[10], [x22]\n"
+ "ld1 { v19.b }[10], [x21]\n"
+ "ld1 { v22.b }[10], [x20]\n"
+ "b 13f\n"
+ "8:" // odd_loads_1_8
+ "mov x19, #0x2\n" // 8 bytes -> 2 blocks; raised to 3 below when a 9th byte is loaded
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[8], [x27]\n"
+ "ld1 { v28.b }[8], [x26]\n"
+ "ld1 { v27.b }[8], [x25]\n"
+ "ld1 { v21.b }[8], [x24]\n"
+ "ld1 { v26.b }[8], [x23]\n"
+ "ld1 { v25.b }[8], [x22]\n"
+ "ld1 { v19.b }[8], [x21]\n"
+ "ld1 { v22.b }[8], [x20]\n"
+ "mov x19, #0x3\n"
+ "b 13f\n"
+ "9:" // odd_loads_4_0
+ "tbz %x[width], #2, 11f\n"
+ "ldr s29, [x27], #0x4\n"
+ "ldr s28, [x26], #0x4\n"
+ "ldr s27, [x25], #0x4\n"
+ "ldr s21, [x24], #0x4\n"
+ "ldr s26, [x23], #0x4\n"
+ "ldr s25, [x22], #0x4\n"
+ "ldr s19, [x21], #0x4\n"
+ "ldr s22, [x20], #0x4\n"
+ "tbz %x[width], #1, 10f\n"
+ "ld1 { v29.h }[2], [x27], #0x2\n"
+ "ld1 { v28.h }[2], [x26], #0x2\n"
+ "ld1 { v27.h }[2], [x25], #0x2\n"
+ "ld1 { v21.h }[2], [x24], #0x2\n"
+ "ld1 { v26.h }[2], [x23], #0x2\n"
+ "ld1 { v25.h }[2], [x22], #0x2\n"
+ "ld1 { v19.h }[2], [x21], #0x2\n"
+ "ld1 { v22.h }[2], [x20], #0x2\n"
+ "mov x19, #0x2\n" // 6 or 7 bytes -> 2 blocks
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[6], [x27]\n"
+ "ld1 { v28.b }[6], [x26]\n"
+ "ld1 { v27.b }[6], [x25]\n"
+ "ld1 { v21.b }[6], [x24]\n"
+ "ld1 { v26.b }[6], [x23]\n"
+ "ld1 { v25.b }[6], [x22]\n"
+ "ld1 { v19.b }[6], [x21]\n"
+ "ld1 { v22.b }[6], [x20]\n"
+ "b 13f\n"
+ "10:" // odd_loads_1_4
+ "mov x19, #0x1\n" // 4 bytes -> 1 block; raised to 2 below when a 5th byte is loaded
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[4], [x27]\n"
+ "ld1 { v28.b }[4], [x26]\n"
+ "ld1 { v27.b }[4], [x25]\n"
+ "ld1 { v21.b }[4], [x24]\n"
+ "ld1 { v26.b }[4], [x23]\n"
+ "ld1 { v25.b }[4], [x22]\n"
+ "ld1 { v19.b }[4], [x21]\n"
+ "ld1 { v22.b }[4], [x20]\n"
+ "mov x19, #0x2\n"
+ "b 13f\n"
+ "11:" // odd_loads_2_0
+ "tbz %x[width], #1, 12f\n"
+ "ldr h29, [x27], #0x2\n"
+ "ldr h28, [x26], #0x2\n"
+ "ldr h27, [x25], #0x2\n"
+ "ldr h21, [x24], #0x2\n"
+ "ldr h26, [x23], #0x2\n"
+ "ldr h25, [x22], #0x2\n"
+ "ldr h19, [x21], #0x2\n"
+ "ldr h22, [x20], #0x2\n"
+ "mov x19, #0x1\n" // 2 or 3 bytes -> 1 block
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v29.b }[2], [x27]\n"
+ "ld1 { v28.b }[2], [x26]\n"
+ "ld1 { v27.b }[2], [x25]\n"
+ "ld1 { v21.b }[2], [x24]\n"
+ "ld1 { v26.b }[2], [x23]\n"
+ "ld1 { v25.b }[2], [x22]\n"
+ "ld1 { v19.b }[2], [x21]\n"
+ "ld1 { v22.b }[2], [x20]\n"
+ "b 13f\n"
+ "12:" // odd_loads_1_0
+ "ldr b29, [x27, #0x0]\n"
+ "ldr b28, [x26, #0x0]\n"
+ "ldr b27, [x25, #0x0]\n"
+ "ldr b21, [x24, #0x0]\n"
+ "ldr b26, [x23, #0x0]\n"
+ "ldr b25, [x22, #0x0]\n"
+ "ldr b19, [x21, #0x0]\n"
+ "ldr b22, [x20, #0x0]\n"
+ "mov x19, #0x1\n" // 1 byte -> 1 block
+ "13:" // Odd load end
+ "zip1 v23.4s, v29.4s, v27.4s\n" // emit x19 blocks of 0x20 bytes each, with the same zip/sum pattern as the main loop
+ "subs x19, x19, #0x1\n" // flags are consumed by the next "beq 14f"; nothing in between sets flags
+ "zip1 v20.4s, v28.4s, v21.4s\n"
+ "zip1 v16.4s, v23.4s, v20.4s\n"
+ "str q16, [%x[out_ptr], #0x0]\n"
+ "zip1 v18.4s, v26.4s, v19.4s\n"
+ "uadalp v1.8h, v16.16b\n"
+ "zip1 v17.4s, v25.4s, v22.4s\n"
+ "zip1 v16.4s, v18.4s, v17.4s\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "uadalp v0.8h, v16.16b\n"
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "beq 14f\n"
+ "zip2 v24.4s, v23.4s, v20.4s\n"
+ "zip2 v20.4s, v18.4s, v17.4s\n"
+ "str q24, [%x[out_ptr], #0x0]\n"
+ "uadalp v1.8h, v24.16b\n"
+ "str q20, [%x[out_ptr], #0x10]\n"
+ "uadalp v0.8h, v20.16b\n"
+ "subs x19, x19, #0x1\n"
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "beq 14f\n"
+ "zip2 v27.4s, v29.4s, v27.4s\n"
+ "zip2 v23.4s, v28.4s, v21.4s\n"
+ "subs x19, x19, #0x1\n"
+ "zip1 v21.4s, v27.4s, v23.4s\n"
+ "str q21, [%x[out_ptr], #0x0]\n"
+ "zip2 v19.4s, v26.4s, v19.4s\n"
+ "uadalp v1.8h, v21.16b\n"
+ "zip2 v16.4s, v25.4s, v22.4s\n"
+ "zip1 v18.4s, v19.4s, v16.4s\n"
+ "str q18, [%x[out_ptr], #0x10]\n"
+ "uadalp v0.8h, v18.16b\n"
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "beq 14f\n"
+ "zip2 v17.4s, v27.4s, v23.4s\n"
+ "zip2 v16.4s, v19.4s, v16.4s\n"
+ "str q17, [%x[out_ptr], #0x0]\n"
+ "uadalp v1.8h, v17.16b\n"
+ "str q16, [%x[out_ptr], #0x10]\n"
+ "uadalp v0.8h, v16.16b\n"
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ "14:" // Odds skip
+ "uadalp v31.4s, v1.8h\n" // final fold of the 16-bit partial sums into the 32-bit totals
+ "uadalp v30.4s, v0.8h\n"
+ "str q31, [%x[out_ptr], #0x0]\n" // append the eight 32-bit sums (0x20 bytes) after the interleaved data
+ "str q30, [%x[out_ptr], #0x10]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x20\n"
+ : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // read-write operands: both are modified by the asm
+ : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+ : "cc", "memory", "v0", "v1", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp
new file mode 100644
index 0000000000..704a4c9210
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8.hpp
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<> // int8_t specialization: interleaves 8 input rows in 8-byte blocks; no sums are computed here.
+void interleave_block<8, 8, VLType::None, false>(
+ int8_t * &out_ptr, const int8_t * const * in, size_t width, size_t height, // out_ptr: output cursor, advanced by the asm; in: table of 8 row pointers; width: bytes per row
+ size_t row_offset, bool // row_offset: added to every row pointer; the unnamed bool is not used
+)
+{
+ __asm__ __volatile__(
+ "ldr x27, [%x[in], #0x0]\n" // x27..x20 <- in[0]..in[7]; row_offset is added to each one below
+ "cmp %x[height], #0x8\n" // flags are consumed by "beq 1f" below; nothing in between sets flags
+ "ldr x26, [%x[in], #0x8]\n"
+ "add x27, x27, %x[row_offset]\n"
+ "ldr x25, [%x[in], #0x10]\n"
+ "ldr x24, [%x[in], #0x18]\n"
+ "add x26, x26, %x[row_offset]\n"
+ "ldr x23, [%x[in], #0x20]\n"
+ "add x25, x25, %x[row_offset]\n"
+ "ldr x22, [%x[in], #0x28]\n"
+ "ldr x21, [%x[in], #0x30]\n"
+ "add x24, x24, %x[row_offset]\n"
+ "ldr x20, [%x[in], #0x38]\n"
+ "add x23, x23, %x[row_offset]\n"
+ "add x22, x22, %x[row_offset]\n"
+ "add x21, x21, %x[row_offset]\n"
+ "add x20, x20, %x[row_offset]\n"
+ "beq 1f\n" // height == 8: use all eight row pointers as loaded
+ "mov x20, x27\n" // otherwise row 7 reads row 0's data. NOTE(review): also taken for height > 8 - presumably never passed; confirm with callers
+ "cmp %x[height], #0x2\n" // each csel below keeps row k's pointer only if k < height, else substitutes row 0 (x27)
+ "csel x26, x26, x27, GE\n"
+ "csel x25, x25, x27, GT\n"
+ "cmp %x[height], #0x4\n"
+ "csel x24, x24, x27, GE\n"
+ "csel x23, x23, x27, GT\n"
+ "cmp %x[height], #0x6\n"
+ "csel x22, x22, x27, GE\n"
+ "csel x21, x21, x27, GT\n"
+ "1:" // no_pointer_adj
+ "prfm pldl1keep, [x27, #0x0]\n" // prefetch offsets 0x0 and 0x40 of every row
+ "cmp %x[width], #0x10\n" // flags are consumed by "blt 3f" after the prefetches
+ "prfm pldl1keep, [x26, #0x0]\n"
+ "prfm pldl1keep, [x25, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x0]\n"
+ "prfm pldl1keep, [x23, #0x0]\n"
+ "prfm pldl1keep, [x22, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x0]\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x40]\n"
+ "prfm pldl1keep, [x26, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x40]\n"
+ "prfm pldl1keep, [x22, #0x40]\n"
+ "prfm pldl1keep, [x21, #0x40]\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "blt 3f\n" // fewer than 16 bytes per row: go straight to the tail
+ "2:" // Main loop head
+ "ldr q27, [x27], #0x10\n" // 16 bytes per row; rows 0..7 land in v27, v24, v25, v21, v22, v18, v19, v16
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "ldr q24, [x26], #0x10\n"
+ "zip1 v26.2d, v27.2d, v24.2d\n" // 64-bit zips pair the low (zip1) or high (zip2) 8 bytes of two adjacent rows
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "ldr q25, [x25], #0x10\n"
+ "zip2 v24.2d, v27.2d, v24.2d\n"
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "ldr q21, [x24], #0x10\n"
+ "zip1 v23.2d, v25.2d, v21.2d\n"
+ "prfm pldl1keep, [x24, #0x70]\n"
+ "ldr q22, [x23], #0x10\n"
+ "zip2 v21.2d, v25.2d, v21.2d\n"
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "ldr q18, [x22], #0x10\n"
+ "zip1 v20.2d, v22.2d, v18.2d\n"
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "ldr q19, [x21], #0x10\n"
+ "zip2 v18.2d, v22.2d, v18.2d\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v17.2d, v19.2d, v16.2d\n"
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "str q26, [%x[out_ptr], #0x0]\n" // block 0, rows 0-1
+ "zip2 v16.2d, v19.2d, v16.2d\n"
+ "str q23, [%x[out_ptr], #0x10]\n" // block 0, rows 2-3
+ "str q20, [%x[out_ptr], #0x20]\n" // block 0, rows 4-5
+ "str q17, [%x[out_ptr], #0x30]\n" // block 0, rows 6-7
+ "str q24, [%x[out_ptr], #0x40]\n" // block 1, rows 0-1
+ "str q21, [%x[out_ptr], #0x50]\n" // block 1, rows 2-3
+ "str q18, [%x[out_ptr], #0x60]\n" // block 1, rows 4-5
+ "str q16, [%x[out_ptr], #0x70]\n" // block 1, rows 6-7
+ "subs %x[width], %x[width], #0x10\n" // 16 bytes per row consumed
+ "cmp %x[width], #0x10\n"
+ "add %x[out_ptr], %x[out_ptr], #0x80\n"
+ "bge 2b\n"
+ "3:" // Main loop skip
+ "cbz %x[width], 12f\n" // nothing left to copy
+ "tbz %x[width], #3, 7f\n" // tail: bits 3..0 of width select 8/4/2/1-byte loads; x19 is set to the number of 8-byte blocks to emit (1 or 2)
+ "ldr d27, [x27], #0x8\n" // NOTE(review): relies on scalar-form ldr clearing the rest of the vector so unread bytes are stored as 0 - confirm against the Arm ARM
+ "ldr d24, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d18, [x22], #0x8\n"
+ "ldr d19, [x21], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "tbz %x[width], #2, 5f\n"
+ "ld1 { v27.s }[2], [x27], #0x4\n"
+ "ld1 { v24.s }[2], [x26], #0x4\n"
+ "ld1 { v25.s }[2], [x25], #0x4\n"
+ "ld1 { v21.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "ld1 { v18.s }[2], [x22], #0x4\n"
+ "ld1 { v19.s }[2], [x21], #0x4\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "tbz %x[width], #1, 4f\n"
+ "ld1 { v27.h }[6], [x27], #0x2\n"
+ "ld1 { v24.h }[6], [x26], #0x2\n"
+ "ld1 { v25.h }[6], [x25], #0x2\n"
+ "ld1 { v21.h }[6], [x24], #0x2\n"
+ "ld1 { v22.h }[6], [x23], #0x2\n"
+ "ld1 { v18.h }[6], [x22], #0x2\n"
+ "ld1 { v19.h }[6], [x21], #0x2\n"
+ "ld1 { v16.h }[6], [x20], #0x2\n"
+ "mov x19, #0x2\n" // 14 or 15 bytes -> 2 blocks
+ "tbz %x[width], #0, 11f\n"
+ "ld1 { v27.b }[14], [x27]\n"
+ "ld1 { v24.b }[14], [x26]\n"
+ "ld1 { v25.b }[14], [x25]\n"
+ "ld1 { v21.b }[14], [x24]\n"
+ "ld1 { v22.b }[14], [x23]\n"
+ "ld1 { v18.b }[14], [x22]\n"
+ "ld1 { v19.b }[14], [x21]\n"
+ "ld1 { v16.b }[14], [x20]\n"
+ "b 11f\n"
+ "4:" // odd_loads_1_12
+ "mov x19, #0x2\n" // 12 or 13 bytes -> 2 blocks
+ "tbz %x[width], #0, 11f\n"
+ "ld1 { v27.b }[12], [x27]\n"
+ "ld1 { v24.b }[12], [x26]\n"
+ "ld1 { v25.b }[12], [x25]\n"
+ "ld1 { v21.b }[12], [x24]\n"
+ "ld1 { v22.b }[12], [x23]\n"
+ "ld1 { v18.b }[12], [x22]\n"
+ "ld1 { v19.b }[12], [x21]\n"
+ "ld1 { v16.b }[12], [x20]\n"
+ "b 11f\n"
+ "5:" // odd_loads_2_8
+ "tbz %x[width], #1, 6f\n"
+ "ld1 { v27.h }[4], [x27], #0x2\n"
+ "ld1 { v24.h }[4], [x26], #0x2\n"
+ "ld1 { v25.h }[4], [x25], #0x2\n"
+ "ld1 { v21.h }[4], [x24], #0x2\n"
+ "ld1 { v22.h }[4], [x23], #0x2\n"
+ "ld1 { v18.h }[4], [x22], #0x2\n"
+ "ld1 { v19.h }[4], [x21], #0x2\n"
+ "ld1 { v16.h }[4], [x20], #0x2\n"
+ "mov x19, #0x2\n" // 10 or 11 bytes -> 2 blocks
+ "tbz %x[width], #0, 11f\n"
+ "ld1 { v27.b }[10], [x27]\n"
+ "ld1 { v24.b }[10], [x26]\n"
+ "ld1 { v25.b }[10], [x25]\n"
+ "ld1 { v21.b }[10], [x24]\n"
+ "ld1 { v22.b }[10], [x23]\n"
+ "ld1 { v18.b }[10], [x22]\n"
+ "ld1 { v19.b }[10], [x21]\n"
+ "ld1 { v16.b }[10], [x20]\n"
+ "b 11f\n"
+ "6:" // odd_loads_1_8
+ "mov x19, #0x1\n" // 8 bytes -> 1 block; raised to 2 below when a 9th byte is loaded
+ "tbz %x[width], #0, 11f\n"
+ "ld1 { v27.b }[8], [x27]\n"
+ "ld1 { v24.b }[8], [x26]\n"
+ "ld1 { v25.b }[8], [x25]\n"
+ "ld1 { v21.b }[8], [x24]\n"
+ "ld1 { v22.b }[8], [x23]\n"
+ "ld1 { v18.b }[8], [x22]\n"
+ "ld1 { v19.b }[8], [x21]\n"
+ "ld1 { v16.b }[8], [x20]\n"
+ "mov x19, #0x2\n"
+ "b 11f\n"
+ "7:" // odd_loads_4_0
+ "tbz %x[width], #2, 9f\n"
+ "ldr s27, [x27], #0x4\n"
+ "ldr s24, [x26], #0x4\n"
+ "ldr s25, [x25], #0x4\n"
+ "ldr s21, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "ldr s18, [x22], #0x4\n"
+ "ldr s19, [x21], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "tbz %x[width], #1, 8f\n"
+ "ld1 { v27.h }[2], [x27], #0x2\n"
+ "ld1 { v24.h }[2], [x26], #0x2\n"
+ "ld1 { v25.h }[2], [x25], #0x2\n"
+ "ld1 { v21.h }[2], [x24], #0x2\n"
+ "ld1 { v22.h }[2], [x23], #0x2\n"
+ "ld1 { v18.h }[2], [x22], #0x2\n"
+ "ld1 { v19.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "mov x19, #0x1\n" // 6 or 7 bytes -> 1 block
+ "tbz %x[width], #0, 11f\n"
+ "ld1 { v27.b }[6], [x27]\n"
+ "ld1 { v24.b }[6], [x26]\n"
+ "ld1 { v25.b }[6], [x25]\n"
+ "ld1 { v21.b }[6], [x24]\n"
+ "ld1 { v22.b }[6], [x23]\n"
+ "ld1 { v18.b }[6], [x22]\n"
+ "ld1 { v19.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 11f\n"
+ "8:" // odd_loads_1_4
+ "mov x19, #0x1\n" // 4 or 5 bytes -> 1 block
+ "tbz %x[width], #0, 11f\n"
+ "ld1 { v27.b }[4], [x27]\n"
+ "ld1 { v24.b }[4], [x26]\n"
+ "ld1 { v25.b }[4], [x25]\n"
+ "ld1 { v21.b }[4], [x24]\n"
+ "ld1 { v22.b }[4], [x23]\n"
+ "ld1 { v18.b }[4], [x22]\n"
+ "ld1 { v19.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 11f\n"
+ "9:" // odd_loads_2_0
+ "tbz %x[width], #1, 10f\n"
+ "ldr h27, [x27], #0x2\n"
+ "ldr h24, [x26], #0x2\n"
+ "ldr h25, [x25], #0x2\n"
+ "ldr h21, [x24], #0x2\n"
+ "ldr h22, [x23], #0x2\n"
+ "ldr h18, [x22], #0x2\n"
+ "ldr h19, [x21], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
+ "mov x19, #0x1\n" // 2 or 3 bytes -> 1 block
+ "tbz %x[width], #0, 11f\n"
+ "ld1 { v27.b }[2], [x27]\n"
+ "ld1 { v24.b }[2], [x26]\n"
+ "ld1 { v25.b }[2], [x25]\n"
+ "ld1 { v21.b }[2], [x24]\n"
+ "ld1 { v22.b }[2], [x23]\n"
+ "ld1 { v18.b }[2], [x22]\n"
+ "ld1 { v19.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 11f\n"
+ "10:" // odd_loads_1_0
+ "ldr b27, [x27, #0x0]\n"
+ "ldr b24, [x26, #0x0]\n"
+ "ldr b25, [x25, #0x0]\n"
+ "ldr b21, [x24, #0x0]\n"
+ "ldr b22, [x23, #0x0]\n"
+ "ldr b18, [x22, #0x0]\n"
+ "ldr b19, [x21, #0x0]\n"
+ "ldr b16, [x20, #0x0]\n"
+ "mov x19, #0x1\n" // 1 byte -> 1 block
+ "11:" // Odd load end
+ "zip1 v26.2d, v27.2d, v24.2d\n" // emit x19 blocks of 0x40 bytes each, laid out as in the main loop
+ "subs x19, x19, #0x1\n" // flags are consumed by "beq 12f" below; nothing in between sets flags
+ "zip1 v23.2d, v25.2d, v21.2d\n"
+ "str q26, [%x[out_ptr], #0x0]\n"
+ "zip1 v20.2d, v22.2d, v18.2d\n"
+ "str q23, [%x[out_ptr], #0x10]\n"
+ "zip1 v17.2d, v19.2d, v16.2d\n"
+ "str q20, [%x[out_ptr], #0x20]\n"
+ "str q17, [%x[out_ptr], #0x30]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x40\n"
+ "beq 12f\n"
+ "zip2 v24.2d, v27.2d, v24.2d\n"
+ "zip2 v21.2d, v25.2d, v21.2d\n"
+ "str q24, [%x[out_ptr], #0x0]\n"
+ "zip2 v18.2d, v22.2d, v18.2d\n"
+ "str q21, [%x[out_ptr], #0x10]\n"
+ "zip2 v16.2d, v19.2d, v16.2d\n"
+ "str q18, [%x[out_ptr], #0x20]\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x40\n"
+ "12:" // Odds skip
+
+ : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // read-write operands: both are modified by the asm
+ : [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+ : "cc", "memory", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ );
+}
+
+template<> // uint8_t entry point for the 8-row / 8-byte-block interleave; forwards to the int8_t specialization.
+void interleave_block<8, 8, VLType::None, false>(
+ uint8_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height,
+ size_t row_offset, bool
+)
+{
+ // The output pointer is cast as a reference so that the callee's updates to it land in the caller's out_ptr.
+ interleave_block<8, 8, VLType::None, false>(
+ reinterpret_cast<int8_t * &>(out_ptr), reinterpret_cast<const int8_t * const *>(in),
+ width, height, row_offset, false);
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp
new file mode 100644
index 0000000000..2317ece790
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_s8_s8_summing.hpp
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<> // int8_t specialisation: interleaves 8 rows in 8-byte blocks and appends one 32-bit sum per row.
+void interleave_block<8, 8, VLType::None, true>( // out_ptr is an in/out asm operand and ends up past everything written.
+ int8_t * &out_ptr, const int8_t * const * in, size_t width, size_t height, // in: 8 row pointers (all 8 entries are read); width: bytes per row; height: rows that are valid.
+ size_t row_offset, bool first // row_offset: byte offset added to each row pointer; first: when false, the 32 bytes before out_ptr are reloaded and added to the sums.
+)
+{
+ __asm__ __volatile__(
+ "movi v5.8h, #0x0\n" // v5/v4/v3/v2: 16-bit partial sums for row pairs (0,1), (2,3), (4,5), (6,7)
+ "ldr x27, [%x[in], #0x0]\n" // x27..x20 receive the pointers for rows 0..7
+ "mov x19, #0x0\n" // x19: main-loop iterations since the 16-bit sums were last folded
+ "movi v4.8h, #0x0\n"
+ "ldr x26, [%x[in], #0x8]\n"
+ "cmp %x[height], #0x8\n" // flags are consumed by the 'beq 1f' below; nothing in between alters them
+ "movi v3.8h, #0x0\n"
+ "ldr x25, [%x[in], #0x10]\n"
+ "add x27, x27, %x[row_offset]\n" // every row pointer is advanced by row_offset bytes
+ "movi v2.8h, #0x0\n"
+ "ldr x24, [%x[in], #0x18]\n"
+ "movi v1.4s, #0x0\n" // v1/v0/v31/v30: 32-bit sums that v5/v4/v3/v2 are folded into
+ "ldr x23, [%x[in], #0x20]\n"
+ "add x26, x26, %x[row_offset]\n"
+ "movi v0.4s, #0x0\n"
+ "ldr x22, [%x[in], #0x28]\n"
+ "add x25, x25, %x[row_offset]\n"
+ "movi v31.4s, #0x0\n"
+ "ldr x21, [%x[in], #0x30]\n"
+ "add x24, x24, %x[row_offset]\n"
+ "movi v30.4s, #0x0\n"
+ "ldr x20, [%x[in], #0x38]\n"
+ "add x23, x23, %x[row_offset]\n"
+ "add x22, x22, %x[row_offset]\n"
+ "add x21, x21, %x[row_offset]\n"
+ "add x20, x20, %x[row_offset]\n"
+ "beq 1f\n" // height == 8: all eight pointers are used as loaded
+ "mov x20, x27\n" // otherwise row 7 always aliases row 0 ...
+ "cmp %x[height], #0x2\n"
+ "csel x26, x26, x27, GE\n" // ... and each row whose index is >= height is redirected to row 0 as well
+ "csel x25, x25, x27, GT\n"
+ "cmp %x[height], #0x4\n"
+ "csel x24, x24, x27, GE\n"
+ "csel x23, x23, x27, GT\n"
+ "cmp %x[height], #0x6\n"
+ "csel x22, x22, x27, GE\n"
+ "csel x21, x21, x27, GT\n"
+ "1:" // no_pointer_adj
+ "movi v29.4s, #0x0\n" // v29/v28: sums carried in from before this call (zero unless reloaded below)
+ "prfm pldl1keep, [x27, #0x0]\n" // prefetch the first 128 bytes of every row
+ "movi v28.4s, #0x0\n"
+ "prfm pldl1keep, [x26, #0x0]\n"
+ "prfm pldl1keep, [x25, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x0]\n"
+ "prfm pldl1keep, [x23, #0x0]\n"
+ "prfm pldl1keep, [x22, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x0]\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x40]\n"
+ "prfm pldl1keep, [x26, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x40]\n"
+ "prfm pldl1keep, [x22, #0x40]\n"
+ "prfm pldl1keep, [x21, #0x40]\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "cbnz %w[first], 2f\n" // first != 0: keep v29/v28 at zero and leave out_ptr where it is
+ "sub %x[out_ptr], %x[out_ptr], #0x20\n" // step back over the 32 bytes before out_ptr (presumably the sums stored by a preceding call - confirm with caller)
+ "ld1 { v29.4s }, [%x[out_ptr]]\n" // those bytes are reloaded here and will be overwritten by the output below
+ "ldr q28, [%x[out_ptr], #0x10]\n"
+ "2:" // first_pass
+ "cmp %x[width], #0x10\n"
+ "blt 5f\n" // fewer than 16 bytes per row: go straight to the tail handling
+ "3:" // Main loop head
+ "cmp x19, #0x3e\n"
+ "ble 4f\n" // fold only once more than 62 iterations have accumulated (presumably to keep the 16-bit lanes from overflowing)
+ "sadalp v1.4s, v5.8h\n" // fold the 16-bit partial sums into the 32-bit sums, then restart them from zero
+ "movi v5.8h, #0x0\n"
+ "sadalp v0.4s, v4.8h\n"
+ "movi v4.8h, #0x0\n"
+ "sadalp v31.4s, v3.8h\n"
+ "movi v3.8h, #0x0\n"
+ "sadalp v30.4s, v2.8h\n"
+ "movi v2.8h, #0x0\n"
+ "mov x19, #0x0\n"
+ "4:" // no_accumulate_16
+ "ldr q27, [x27], #0x10\n" // load 16 bytes from each row, post-incrementing the row pointer
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "ldr q24, [x26], #0x10\n"
+ "zip1 v26.2d, v27.2d, v24.2d\n" // zip1: bytes 0-7 of row 0 followed by bytes 0-7 of row 1
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "ldr q25, [x25], #0x10\n"
+ "zip2 v24.2d, v27.2d, v24.2d\n" // zip2: bytes 8-15 of row 0 followed by bytes 8-15 of row 1
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "ldr q21, [x24], #0x10\n"
+ "zip1 v23.2d, v25.2d, v21.2d\n"
+ "prfm pldl1keep, [x24, #0x70]\n"
+ "ldr q22, [x23], #0x10\n"
+ "zip2 v21.2d, v25.2d, v21.2d\n"
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "ldr q18, [x22], #0x10\n"
+ "zip1 v20.2d, v22.2d, v18.2d\n"
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "ldr q19, [x21], #0x10\n"
+ "zip2 v18.2d, v22.2d, v18.2d\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v17.2d, v19.2d, v16.2d\n"
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "str q26, [%x[out_ptr], #0x0]\n" // first 64 bytes out: the low 8-byte block of rows 0..7
+ "zip2 v16.2d, v19.2d, v16.2d\n"
+ "sadalp v5.8h, v26.16b\n" // add adjacent signed byte pairs of the stored data into the 16-bit sums
+ "str q23, [%x[out_ptr], #0x10]\n"
+ "sadalp v4.8h, v23.16b\n"
+ "str q20, [%x[out_ptr], #0x20]\n"
+ "sadalp v3.8h, v20.16b\n"
+ "str q17, [%x[out_ptr], #0x30]\n"
+ "sadalp v2.8h, v17.16b\n"
+ "str q24, [%x[out_ptr], #0x40]\n" // second 64 bytes out: the high 8-byte block of rows 0..7
+ "sadalp v5.8h, v24.16b\n"
+ "str q21, [%x[out_ptr], #0x50]\n"
+ "sadalp v4.8h, v21.16b\n"
+ "str q18, [%x[out_ptr], #0x60]\n"
+ "sadalp v3.8h, v18.16b\n"
+ "str q16, [%x[out_ptr], #0x70]\n"
+ "sadalp v2.8h, v16.16b\n"
+ "add x19, x19, #0x1\n"
+ "subs %x[width], %x[width], #0x10\n" // the flags set here are overwritten by the cmp on the next line
+ "cmp %x[width], #0x10\n"
+ "add %x[out_ptr], %x[out_ptr], #0x80\n"
+ "bge 3b\n" // keep looping while at least 16 bytes per row remain
+ "5:" // Main loop skip
+ "cbz %x[width], 14f\n" // nothing left over: only the sums remain to be written
+ "tbz %x[width], #3, 9f\n" // width is below 16 here; bits 3..0 select which partial loads are needed
+ "ldr d27, [x27], #0x8\n" // scalar ldr into d/s/h/b also zeroes the rest of the vector register, so bytes not loaded stay zero
+ "ldr d24, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d18, [x22], #0x8\n"
+ "ldr d19, [x21], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "tbz %x[width], #2, 7f\n"
+ "ld1 { v27.s }[2], [x27], #0x4\n" // bytes 8-11 of each row
+ "ld1 { v24.s }[2], [x26], #0x4\n"
+ "ld1 { v25.s }[2], [x25], #0x4\n"
+ "ld1 { v21.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "ld1 { v18.s }[2], [x22], #0x4\n"
+ "ld1 { v19.s }[2], [x21], #0x4\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "tbz %x[width], #1, 6f\n"
+ "ld1 { v27.h }[6], [x27], #0x2\n" // bytes 12-13 of each row
+ "ld1 { v24.h }[6], [x26], #0x2\n"
+ "ld1 { v25.h }[6], [x25], #0x2\n"
+ "ld1 { v21.h }[6], [x24], #0x2\n"
+ "ld1 { v22.h }[6], [x23], #0x2\n"
+ "ld1 { v18.h }[6], [x22], #0x2\n"
+ "ld1 { v19.h }[6], [x21], #0x2\n"
+ "ld1 { v16.h }[6], [x20], #0x2\n"
+ "mov x19, #0x2\n" // from here on x19 is the number of 64-byte output groups to emit for the tail
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v27.b }[14], [x27]\n" // byte 14 of each row (last possible leftover byte)
+ "ld1 { v24.b }[14], [x26]\n"
+ "ld1 { v25.b }[14], [x25]\n"
+ "ld1 { v21.b }[14], [x24]\n"
+ "ld1 { v22.b }[14], [x23]\n"
+ "ld1 { v18.b }[14], [x22]\n"
+ "ld1 { v19.b }[14], [x21]\n"
+ "ld1 { v16.b }[14], [x20]\n"
+ "b 13f\n"
+ "6:" // odd_loads_1_12
+ "mov x19, #0x2\n" // 12 or 13 leftover bytes: two output groups
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v27.b }[12], [x27]\n"
+ "ld1 { v24.b }[12], [x26]\n"
+ "ld1 { v25.b }[12], [x25]\n"
+ "ld1 { v21.b }[12], [x24]\n"
+ "ld1 { v22.b }[12], [x23]\n"
+ "ld1 { v18.b }[12], [x22]\n"
+ "ld1 { v19.b }[12], [x21]\n"
+ "ld1 { v16.b }[12], [x20]\n"
+ "b 13f\n"
+ "7:" // odd_loads_2_8
+ "tbz %x[width], #1, 8f\n"
+ "ld1 { v27.h }[4], [x27], #0x2\n" // bytes 8-9 of each row
+ "ld1 { v24.h }[4], [x26], #0x2\n"
+ "ld1 { v25.h }[4], [x25], #0x2\n"
+ "ld1 { v21.h }[4], [x24], #0x2\n"
+ "ld1 { v22.h }[4], [x23], #0x2\n"
+ "ld1 { v18.h }[4], [x22], #0x2\n"
+ "ld1 { v19.h }[4], [x21], #0x2\n"
+ "ld1 { v16.h }[4], [x20], #0x2\n"
+ "mov x19, #0x2\n" // 10 or 11 leftover bytes: two output groups
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v27.b }[10], [x27]\n"
+ "ld1 { v24.b }[10], [x26]\n"
+ "ld1 { v25.b }[10], [x25]\n"
+ "ld1 { v21.b }[10], [x24]\n"
+ "ld1 { v22.b }[10], [x23]\n"
+ "ld1 { v18.b }[10], [x22]\n"
+ "ld1 { v19.b }[10], [x21]\n"
+ "ld1 { v16.b }[10], [x20]\n"
+ "b 13f\n"
+ "8:" // odd_loads_1_8
+ "mov x19, #0x1\n" // exactly 8 leftover bytes: a single output group
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v27.b }[8], [x27]\n"
+ "ld1 { v24.b }[8], [x26]\n"
+ "ld1 { v25.b }[8], [x25]\n"
+ "ld1 { v21.b }[8], [x24]\n"
+ "ld1 { v22.b }[8], [x23]\n"
+ "ld1 { v18.b }[8], [x22]\n"
+ "ld1 { v19.b }[8], [x21]\n"
+ "ld1 { v16.b }[8], [x20]\n"
+ "mov x19, #0x2\n" // a ninth byte was loaded, so the second group is needed too
+ "b 13f\n"
+ "9:" // odd_loads_4_0
+ "tbz %x[width], #2, 11f\n" // fewer than 8 leftover bytes: every path below emits one output group
+ "ldr s27, [x27], #0x4\n"
+ "ldr s24, [x26], #0x4\n"
+ "ldr s25, [x25], #0x4\n"
+ "ldr s21, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "ldr s18, [x22], #0x4\n"
+ "ldr s19, [x21], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "tbz %x[width], #1, 10f\n"
+ "ld1 { v27.h }[2], [x27], #0x2\n"
+ "ld1 { v24.h }[2], [x26], #0x2\n"
+ "ld1 { v25.h }[2], [x25], #0x2\n"
+ "ld1 { v21.h }[2], [x24], #0x2\n"
+ "ld1 { v22.h }[2], [x23], #0x2\n"
+ "ld1 { v18.h }[2], [x22], #0x2\n"
+ "ld1 { v19.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "mov x19, #0x1\n"
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v27.b }[6], [x27]\n"
+ "ld1 { v24.b }[6], [x26]\n"
+ "ld1 { v25.b }[6], [x25]\n"
+ "ld1 { v21.b }[6], [x24]\n"
+ "ld1 { v22.b }[6], [x23]\n"
+ "ld1 { v18.b }[6], [x22]\n"
+ "ld1 { v19.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 13f\n"
+ "10:" // odd_loads_1_4
+ "mov x19, #0x1\n"
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v27.b }[4], [x27]\n"
+ "ld1 { v24.b }[4], [x26]\n"
+ "ld1 { v25.b }[4], [x25]\n"
+ "ld1 { v21.b }[4], [x24]\n"
+ "ld1 { v22.b }[4], [x23]\n"
+ "ld1 { v18.b }[4], [x22]\n"
+ "ld1 { v19.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 13f\n"
+ "11:" // odd_loads_2_0
+ "tbz %x[width], #1, 12f\n"
+ "ldr h27, [x27], #0x2\n"
+ "ldr h24, [x26], #0x2\n"
+ "ldr h25, [x25], #0x2\n"
+ "ldr h21, [x24], #0x2\n"
+ "ldr h22, [x23], #0x2\n"
+ "ldr h18, [x22], #0x2\n"
+ "ldr h19, [x21], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
+ "mov x19, #0x1\n"
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v27.b }[2], [x27]\n"
+ "ld1 { v24.b }[2], [x26]\n"
+ "ld1 { v25.b }[2], [x25]\n"
+ "ld1 { v21.b }[2], [x24]\n"
+ "ld1 { v22.b }[2], [x23]\n"
+ "ld1 { v18.b }[2], [x22]\n"
+ "ld1 { v19.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 13f\n"
+ "12:" // odd_loads_1_0
+ "ldr b27, [x27, #0x0]\n" // a single leftover byte per row
+ "ldr b24, [x26, #0x0]\n"
+ "ldr b25, [x25, #0x0]\n"
+ "ldr b21, [x24, #0x0]\n"
+ "ldr b22, [x23, #0x0]\n"
+ "ldr b18, [x22, #0x0]\n"
+ "ldr b19, [x21, #0x0]\n"
+ "ldr b16, [x20, #0x0]\n"
+ "mov x19, #0x1\n"
+ "13:" // Odd load end
+ "zip1 v26.2d, v27.2d, v24.2d\n" // first tail group: low 8-byte block of every row
+ "subs x19, x19, #0x1\n" // sets the flags tested by 'beq 14f'; the instructions in between leave them untouched
+ "zip1 v23.2d, v25.2d, v21.2d\n"
+ "str q26, [%x[out_ptr], #0x0]\n"
+ "zip1 v20.2d, v22.2d, v18.2d\n"
+ "sadalp v5.8h, v26.16b\n"
+ "zip1 v17.2d, v19.2d, v16.2d\n"
+ "str q23, [%x[out_ptr], #0x10]\n"
+ "sadalp v4.8h, v23.16b\n"
+ "str q20, [%x[out_ptr], #0x20]\n"
+ "sadalp v3.8h, v20.16b\n"
+ "str q17, [%x[out_ptr], #0x30]\n"
+ "sadalp v2.8h, v17.16b\n"
+ "add %x[out_ptr], %x[out_ptr], #0x40\n"
+ "beq 14f\n" // only one group was requested
+ "zip2 v24.2d, v27.2d, v24.2d\n" // second tail group: high 8-byte block of every row
+ "zip2 v21.2d, v25.2d, v21.2d\n"
+ "str q24, [%x[out_ptr], #0x0]\n"
+ "zip2 v18.2d, v22.2d, v18.2d\n"
+ "sadalp v5.8h, v24.16b\n"
+ "zip2 v16.2d, v19.2d, v16.2d\n"
+ "str q21, [%x[out_ptr], #0x10]\n"
+ "sadalp v4.8h, v21.16b\n"
+ "str q18, [%x[out_ptr], #0x20]\n"
+ "sadalp v3.8h, v18.16b\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "sadalp v2.8h, v16.16b\n"
+ "add %x[out_ptr], %x[out_ptr], #0x40\n"
+ "14:" // Odds skip
+ "sadalp v1.4s, v5.8h\n" // final fold of the 16-bit partial sums into 32 bits
+ "sadalp v0.4s, v4.8h\n"
+ "addp v1.4s, v1.4s, v0.4s\n" // pairwise add leaves one 32-bit sum per row: v1 = rows 0..3
+ "sadalp v31.4s, v3.8h\n"
+ "sadalp v30.4s, v2.8h\n"
+ "add v1.4s, v1.4s, v29.4s\n" // add the sums carried in via v29/v28
+ "str q1, [%x[out_ptr], #0x0]\n"
+ "addp v0.4s, v31.4s, v30.4s\n" // v0 = rows 4..7
+ "add v0.4s, v0.4s, v28.4s\n"
+ "str q0, [%x[out_ptr], #0x10]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x20\n" // out_ptr finishes just past the 32 bytes of sums
+ : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // both are modified by the assembly
+ : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp
new file mode 100644
index 0000000000..07164d6b24
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/a64_interleave8_block8_u8_u8_summing.hpp
@@ -0,0 +1,362 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+template<> // uint8_t specialisation: interleaves 8 rows in 8-byte blocks and appends one 32-bit sum per row.
+void interleave_block<8, 8, VLType::None, true>( // out_ptr is an in/out asm operand and ends up past everything written.
+ uint8_t * &out_ptr, const uint8_t * const * in, size_t width, size_t height, // in: 8 row pointers (all 8 entries are read); width: bytes per row; height: rows that are valid.
+ size_t row_offset, bool first // row_offset: byte offset added to each row pointer; first: when false, the 32 bytes before out_ptr are reloaded and added to the sums.
+)
+{
+ __asm__ __volatile__(
+ "movi v5.8h, #0x0\n" // v5/v4/v3/v2: 16-bit partial sums for row pairs (0,1), (2,3), (4,5), (6,7)
+ "ldr x27, [%x[in], #0x0]\n" // x27..x20 receive the pointers for rows 0..7
+ "mov x19, #0x0\n" // x19: main-loop iterations since the 16-bit sums were last folded
+ "movi v4.8h, #0x0\n"
+ "ldr x26, [%x[in], #0x8]\n"
+ "cmp %x[height], #0x8\n" // flags are consumed by the 'beq 1f' below; nothing in between alters them
+ "movi v3.8h, #0x0\n"
+ "ldr x25, [%x[in], #0x10]\n"
+ "add x27, x27, %x[row_offset]\n" // every row pointer is advanced by row_offset bytes
+ "movi v2.8h, #0x0\n"
+ "ldr x24, [%x[in], #0x18]\n"
+ "movi v1.4s, #0x0\n" // v1/v0/v31/v30: 32-bit sums that v5/v4/v3/v2 are folded into
+ "ldr x23, [%x[in], #0x20]\n"
+ "add x26, x26, %x[row_offset]\n"
+ "movi v0.4s, #0x0\n"
+ "ldr x22, [%x[in], #0x28]\n"
+ "add x25, x25, %x[row_offset]\n"
+ "movi v31.4s, #0x0\n"
+ "ldr x21, [%x[in], #0x30]\n"
+ "add x24, x24, %x[row_offset]\n"
+ "movi v30.4s, #0x0\n"
+ "ldr x20, [%x[in], #0x38]\n"
+ "add x23, x23, %x[row_offset]\n"
+ "add x22, x22, %x[row_offset]\n"
+ "add x21, x21, %x[row_offset]\n"
+ "add x20, x20, %x[row_offset]\n"
+ "beq 1f\n" // height == 8: all eight pointers are used as loaded
+ "mov x20, x27\n" // otherwise row 7 always aliases row 0 ...
+ "cmp %x[height], #0x2\n"
+ "csel x26, x26, x27, GE\n" // ... and each row whose index is >= height is redirected to row 0 as well
+ "csel x25, x25, x27, GT\n"
+ "cmp %x[height], #0x4\n"
+ "csel x24, x24, x27, GE\n"
+ "csel x23, x23, x27, GT\n"
+ "cmp %x[height], #0x6\n"
+ "csel x22, x22, x27, GE\n"
+ "csel x21, x21, x27, GT\n"
+ "1:" // no_pointer_adj
+ "movi v29.4s, #0x0\n" // v29/v28: sums carried in from before this call (zero unless reloaded below)
+ "prfm pldl1keep, [x27, #0x0]\n" // prefetch the first 128 bytes of every row
+ "movi v28.4s, #0x0\n"
+ "prfm pldl1keep, [x26, #0x0]\n"
+ "prfm pldl1keep, [x25, #0x0]\n"
+ "prfm pldl1keep, [x24, #0x0]\n"
+ "prfm pldl1keep, [x23, #0x0]\n"
+ "prfm pldl1keep, [x22, #0x0]\n"
+ "prfm pldl1keep, [x21, #0x0]\n"
+ "prfm pldl1keep, [x20, #0x0]\n"
+ "prfm pldl1keep, [x27, #0x40]\n"
+ "prfm pldl1keep, [x26, #0x40]\n"
+ "prfm pldl1keep, [x25, #0x40]\n"
+ "prfm pldl1keep, [x24, #0x40]\n"
+ "prfm pldl1keep, [x23, #0x40]\n"
+ "prfm pldl1keep, [x22, #0x40]\n"
+ "prfm pldl1keep, [x21, #0x40]\n"
+ "prfm pldl1keep, [x20, #0x40]\n"
+ "cbnz %w[first], 2f\n" // first != 0: keep v29/v28 at zero and leave out_ptr where it is
+ "sub %x[out_ptr], %x[out_ptr], #0x20\n" // step back over the 32 bytes before out_ptr (presumably the sums stored by a preceding call - confirm with caller)
+ "ld1 { v29.4s }, [%x[out_ptr]]\n" // those bytes are reloaded here and will be overwritten by the output below
+ "ldr q28, [%x[out_ptr], #0x10]\n"
+ "2:" // first_pass
+ "cmp %x[width], #0x10\n"
+ "blt 5f\n" // fewer than 16 bytes per row: go straight to the tail handling
+ "3:" // Main loop head
+ "cmp x19, #0x3e\n"
+ "ble 4f\n" // fold only once more than 62 iterations have accumulated (presumably to keep the 16-bit lanes from overflowing)
+ "uadalp v1.4s, v5.8h\n" // fold the 16-bit partial sums into the 32-bit sums, then restart them from zero
+ "movi v5.8h, #0x0\n"
+ "uadalp v0.4s, v4.8h\n"
+ "movi v4.8h, #0x0\n"
+ "uadalp v31.4s, v3.8h\n"
+ "movi v3.8h, #0x0\n"
+ "uadalp v30.4s, v2.8h\n"
+ "movi v2.8h, #0x0\n"
+ "mov x19, #0x0\n"
+ "4:" // no_accumulate_16
+ "ldr q27, [x27], #0x10\n" // load 16 bytes from each row, post-incrementing the row pointer
+ "prfm pldl1keep, [x27, #0x70]\n"
+ "ldr q24, [x26], #0x10\n"
+ "zip1 v26.2d, v27.2d, v24.2d\n" // zip1: bytes 0-7 of row 0 followed by bytes 0-7 of row 1
+ "prfm pldl1keep, [x26, #0x70]\n"
+ "ldr q25, [x25], #0x10\n"
+ "zip2 v24.2d, v27.2d, v24.2d\n" // zip2: bytes 8-15 of row 0 followed by bytes 8-15 of row 1
+ "prfm pldl1keep, [x25, #0x70]\n"
+ "ldr q21, [x24], #0x10\n"
+ "zip1 v23.2d, v25.2d, v21.2d\n"
+ "prfm pldl1keep, [x24, #0x70]\n"
+ "ldr q22, [x23], #0x10\n"
+ "zip2 v21.2d, v25.2d, v21.2d\n"
+ "prfm pldl1keep, [x23, #0x70]\n"
+ "ldr q18, [x22], #0x10\n"
+ "zip1 v20.2d, v22.2d, v18.2d\n"
+ "prfm pldl1keep, [x22, #0x70]\n"
+ "ldr q19, [x21], #0x10\n"
+ "zip2 v18.2d, v22.2d, v18.2d\n"
+ "prfm pldl1keep, [x21, #0x70]\n"
+ "ldr q16, [x20], #0x10\n"
+ "zip1 v17.2d, v19.2d, v16.2d\n"
+ "prfm pldl1keep, [x20, #0x70]\n"
+ "str q26, [%x[out_ptr], #0x0]\n" // first 64 bytes out: the low 8-byte block of rows 0..7
+ "zip2 v16.2d, v19.2d, v16.2d\n"
+ "uadalp v5.8h, v26.16b\n" // add adjacent unsigned byte pairs of the stored data into the 16-bit sums
+ "str q23, [%x[out_ptr], #0x10]\n"
+ "uadalp v4.8h, v23.16b\n"
+ "str q20, [%x[out_ptr], #0x20]\n"
+ "uadalp v3.8h, v20.16b\n"
+ "str q17, [%x[out_ptr], #0x30]\n"
+ "uadalp v2.8h, v17.16b\n"
+ "str q24, [%x[out_ptr], #0x40]\n" // second 64 bytes out: the high 8-byte block of rows 0..7
+ "uadalp v5.8h, v24.16b\n"
+ "str q21, [%x[out_ptr], #0x50]\n"
+ "uadalp v4.8h, v21.16b\n"
+ "str q18, [%x[out_ptr], #0x60]\n"
+ "uadalp v3.8h, v18.16b\n"
+ "str q16, [%x[out_ptr], #0x70]\n"
+ "uadalp v2.8h, v16.16b\n"
+ "add x19, x19, #0x1\n"
+ "subs %x[width], %x[width], #0x10\n" // the flags set here are overwritten by the cmp on the next line
+ "cmp %x[width], #0x10\n"
+ "add %x[out_ptr], %x[out_ptr], #0x80\n"
+ "bge 3b\n" // keep looping while at least 16 bytes per row remain
+ "5:" // Main loop skip
+ "cbz %x[width], 14f\n" // nothing left over: only the sums remain to be written
+ "tbz %x[width], #3, 9f\n" // width is below 16 here; bits 3..0 select which partial loads are needed
+ "ldr d27, [x27], #0x8\n" // scalar ldr into d/s/h/b also zeroes the rest of the vector register, so bytes not loaded stay zero
+ "ldr d24, [x26], #0x8\n"
+ "ldr d25, [x25], #0x8\n"
+ "ldr d21, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d18, [x22], #0x8\n"
+ "ldr d19, [x21], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "tbz %x[width], #2, 7f\n"
+ "ld1 { v27.s }[2], [x27], #0x4\n" // bytes 8-11 of each row
+ "ld1 { v24.s }[2], [x26], #0x4\n"
+ "ld1 { v25.s }[2], [x25], #0x4\n"
+ "ld1 { v21.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "ld1 { v18.s }[2], [x22], #0x4\n"
+ "ld1 { v19.s }[2], [x21], #0x4\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "tbz %x[width], #1, 6f\n"
+ "ld1 { v27.h }[6], [x27], #0x2\n" // bytes 12-13 of each row
+ "ld1 { v24.h }[6], [x26], #0x2\n"
+ "ld1 { v25.h }[6], [x25], #0x2\n"
+ "ld1 { v21.h }[6], [x24], #0x2\n"
+ "ld1 { v22.h }[6], [x23], #0x2\n"
+ "ld1 { v18.h }[6], [x22], #0x2\n"
+ "ld1 { v19.h }[6], [x21], #0x2\n"
+ "ld1 { v16.h }[6], [x20], #0x2\n"
+ "mov x19, #0x2\n" // from here on x19 is the number of 64-byte output groups to emit for the tail
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v27.b }[14], [x27]\n" // byte 14 of each row (last possible leftover byte)
+ "ld1 { v24.b }[14], [x26]\n"
+ "ld1 { v25.b }[14], [x25]\n"
+ "ld1 { v21.b }[14], [x24]\n"
+ "ld1 { v22.b }[14], [x23]\n"
+ "ld1 { v18.b }[14], [x22]\n"
+ "ld1 { v19.b }[14], [x21]\n"
+ "ld1 { v16.b }[14], [x20]\n"
+ "b 13f\n"
+ "6:" // odd_loads_1_12
+ "mov x19, #0x2\n" // 12 or 13 leftover bytes: two output groups
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v27.b }[12], [x27]\n"
+ "ld1 { v24.b }[12], [x26]\n"
+ "ld1 { v25.b }[12], [x25]\n"
+ "ld1 { v21.b }[12], [x24]\n"
+ "ld1 { v22.b }[12], [x23]\n"
+ "ld1 { v18.b }[12], [x22]\n"
+ "ld1 { v19.b }[12], [x21]\n"
+ "ld1 { v16.b }[12], [x20]\n"
+ "b 13f\n"
+ "7:" // odd_loads_2_8
+ "tbz %x[width], #1, 8f\n"
+ "ld1 { v27.h }[4], [x27], #0x2\n" // bytes 8-9 of each row
+ "ld1 { v24.h }[4], [x26], #0x2\n"
+ "ld1 { v25.h }[4], [x25], #0x2\n"
+ "ld1 { v21.h }[4], [x24], #0x2\n"
+ "ld1 { v22.h }[4], [x23], #0x2\n"
+ "ld1 { v18.h }[4], [x22], #0x2\n"
+ "ld1 { v19.h }[4], [x21], #0x2\n"
+ "ld1 { v16.h }[4], [x20], #0x2\n"
+ "mov x19, #0x2\n" // 10 or 11 leftover bytes: two output groups
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v27.b }[10], [x27]\n"
+ "ld1 { v24.b }[10], [x26]\n"
+ "ld1 { v25.b }[10], [x25]\n"
+ "ld1 { v21.b }[10], [x24]\n"
+ "ld1 { v22.b }[10], [x23]\n"
+ "ld1 { v18.b }[10], [x22]\n"
+ "ld1 { v19.b }[10], [x21]\n"
+ "ld1 { v16.b }[10], [x20]\n"
+ "b 13f\n"
+ "8:" // odd_loads_1_8
+ "mov x19, #0x1\n" // exactly 8 leftover bytes: a single output group
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v27.b }[8], [x27]\n"
+ "ld1 { v24.b }[8], [x26]\n"
+ "ld1 { v25.b }[8], [x25]\n"
+ "ld1 { v21.b }[8], [x24]\n"
+ "ld1 { v22.b }[8], [x23]\n"
+ "ld1 { v18.b }[8], [x22]\n"
+ "ld1 { v19.b }[8], [x21]\n"
+ "ld1 { v16.b }[8], [x20]\n"
+ "mov x19, #0x2\n" // a ninth byte was loaded, so the second group is needed too
+ "b 13f\n"
+ "9:" // odd_loads_4_0
+ "tbz %x[width], #2, 11f\n" // fewer than 8 leftover bytes: every path below emits one output group
+ "ldr s27, [x27], #0x4\n"
+ "ldr s24, [x26], #0x4\n"
+ "ldr s25, [x25], #0x4\n"
+ "ldr s21, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "ldr s18, [x22], #0x4\n"
+ "ldr s19, [x21], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "tbz %x[width], #1, 10f\n"
+ "ld1 { v27.h }[2], [x27], #0x2\n"
+ "ld1 { v24.h }[2], [x26], #0x2\n"
+ "ld1 { v25.h }[2], [x25], #0x2\n"
+ "ld1 { v21.h }[2], [x24], #0x2\n"
+ "ld1 { v22.h }[2], [x23], #0x2\n"
+ "ld1 { v18.h }[2], [x22], #0x2\n"
+ "ld1 { v19.h }[2], [x21], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "mov x19, #0x1\n"
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v27.b }[6], [x27]\n"
+ "ld1 { v24.b }[6], [x26]\n"
+ "ld1 { v25.b }[6], [x25]\n"
+ "ld1 { v21.b }[6], [x24]\n"
+ "ld1 { v22.b }[6], [x23]\n"
+ "ld1 { v18.b }[6], [x22]\n"
+ "ld1 { v19.b }[6], [x21]\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 13f\n"
+ "10:" // odd_loads_1_4
+ "mov x19, #0x1\n"
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v27.b }[4], [x27]\n"
+ "ld1 { v24.b }[4], [x26]\n"
+ "ld1 { v25.b }[4], [x25]\n"
+ "ld1 { v21.b }[4], [x24]\n"
+ "ld1 { v22.b }[4], [x23]\n"
+ "ld1 { v18.b }[4], [x22]\n"
+ "ld1 { v19.b }[4], [x21]\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 13f\n"
+ "11:" // odd_loads_2_0
+ "tbz %x[width], #1, 12f\n"
+ "ldr h27, [x27], #0x2\n"
+ "ldr h24, [x26], #0x2\n"
+ "ldr h25, [x25], #0x2\n"
+ "ldr h21, [x24], #0x2\n"
+ "ldr h22, [x23], #0x2\n"
+ "ldr h18, [x22], #0x2\n"
+ "ldr h19, [x21], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
+ "mov x19, #0x1\n"
+ "tbz %x[width], #0, 13f\n"
+ "ld1 { v27.b }[2], [x27]\n"
+ "ld1 { v24.b }[2], [x26]\n"
+ "ld1 { v25.b }[2], [x25]\n"
+ "ld1 { v21.b }[2], [x24]\n"
+ "ld1 { v22.b }[2], [x23]\n"
+ "ld1 { v18.b }[2], [x22]\n"
+ "ld1 { v19.b }[2], [x21]\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 13f\n"
+ "12:" // odd_loads_1_0
+ "ldr b27, [x27, #0x0]\n" // a single leftover byte per row
+ "ldr b24, [x26, #0x0]\n"
+ "ldr b25, [x25, #0x0]\n"
+ "ldr b21, [x24, #0x0]\n"
+ "ldr b22, [x23, #0x0]\n"
+ "ldr b18, [x22, #0x0]\n"
+ "ldr b19, [x21, #0x0]\n"
+ "ldr b16, [x20, #0x0]\n"
+ "mov x19, #0x1\n"
+ "13:" // Odd load end
+ "zip1 v26.2d, v27.2d, v24.2d\n" // first tail group: low 8-byte block of every row
+ "subs x19, x19, #0x1\n" // sets the flags tested by 'beq 14f'; the instructions in between leave them untouched
+ "zip1 v23.2d, v25.2d, v21.2d\n"
+ "str q26, [%x[out_ptr], #0x0]\n"
+ "zip1 v20.2d, v22.2d, v18.2d\n"
+ "uadalp v5.8h, v26.16b\n"
+ "zip1 v17.2d, v19.2d, v16.2d\n"
+ "str q23, [%x[out_ptr], #0x10]\n"
+ "uadalp v4.8h, v23.16b\n"
+ "str q20, [%x[out_ptr], #0x20]\n"
+ "uadalp v3.8h, v20.16b\n"
+ "str q17, [%x[out_ptr], #0x30]\n"
+ "uadalp v2.8h, v17.16b\n"
+ "add %x[out_ptr], %x[out_ptr], #0x40\n"
+ "beq 14f\n" // only one group was requested
+ "zip2 v24.2d, v27.2d, v24.2d\n" // second tail group: high 8-byte block of every row
+ "zip2 v21.2d, v25.2d, v21.2d\n"
+ "str q24, [%x[out_ptr], #0x0]\n"
+ "zip2 v18.2d, v22.2d, v18.2d\n"
+ "uadalp v5.8h, v24.16b\n"
+ "zip2 v16.2d, v19.2d, v16.2d\n"
+ "str q21, [%x[out_ptr], #0x10]\n"
+ "uadalp v4.8h, v21.16b\n"
+ "str q18, [%x[out_ptr], #0x20]\n"
+ "uadalp v3.8h, v18.16b\n"
+ "str q16, [%x[out_ptr], #0x30]\n"
+ "uadalp v2.8h, v16.16b\n"
+ "add %x[out_ptr], %x[out_ptr], #0x40\n"
+ "14:" // Odds skip
+ "uadalp v1.4s, v5.8h\n" // final fold of the 16-bit partial sums into 32 bits
+ "uadalp v0.4s, v4.8h\n"
+ "addp v1.4s, v1.4s, v0.4s\n" // pairwise add leaves one 32-bit sum per row: v1 = rows 0..3
+ "uadalp v31.4s, v3.8h\n"
+ "uadalp v30.4s, v2.8h\n"
+ "add v1.4s, v1.4s, v29.4s\n" // add the sums carried in via v29/v28
+ "str q1, [%x[out_ptr], #0x0]\n"
+ "addp v0.4s, v31.4s, v30.4s\n" // v0 = rows 4..7
+ "add v0.4s, v0.4s, v28.4s\n"
+ "str q0, [%x[out_ptr], #0x10]\n"
+ "add %x[out_ptr], %x[out_ptr], #0x20\n" // out_ptr finishes just past the 32 bytes of sums
+ : [out_ptr] "+r" (out_ptr), [width] "+r" (width) // both are modified by the assembly
+ : [first] "r" (first), [height] "r" (height), [in] "r" (in), [row_offset] "r" (row_offset)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27"
+ );
+}
+
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp
new file mode 100644
index 0000000000..52b49c0f0c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/indirect-interleaves/list.hpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "a32_interleave6_block1_fp32_fp32.hpp"
+#include "a64_interleave4_block16_s8_s8.hpp"
+#include "a64_interleave4_block16_s8_s8_summing.hpp"
+#include "a64_interleave4_block16_u8_u8_summing.hpp"
+#include "a64_interleave8_block1_bf16_fp32.hpp"
+#include "a64_interleave8_block1_fp16_fp16.hpp"
+#include "a64_interleave8_block1_fp16_fp32.hpp"
+#include "a64_interleave8_block1_fp32_fp32.hpp"
+#include "a64_interleave8_block1_s16_s16.hpp"
+#include "a64_interleave8_block1_s16_s16_summing.hpp"
+#include "a64_interleave8_block1_s8_s16.hpp"
+#include "a64_interleave8_block1_s8_s16_summing.hpp"
+#include "a64_interleave8_block1_u16_u16_summing.hpp"
+#include "a64_interleave8_block1_u8_u16.hpp"
+#include "a64_interleave8_block1_u8_u16_summing.hpp"
+#include "a64_interleave8_block2_bf16_bf16.hpp"
+#include "a64_interleave8_block2_fp32_fp32.hpp"
+#include "a64_interleave8_block4_bf16_bf16.hpp"
+#include "a64_interleave8_block4_s8_s8.hpp"
+#include "a64_interleave8_block4_s8_s8_summing.hpp"
+#include "a64_interleave8_block4_u8_u8_summing.hpp"
+#include "a64_interleave8_block8_s8_s8.hpp"
+#include "a64_interleave8_block8_s8_s8_summing.hpp"
+#include "a64_interleave8_block8_u8_u8_summing.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp
new file mode 100644
index 0000000000..2b3e170a3b
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect.cpp
@@ -0,0 +1,409 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "asmlib.hpp"
+#include "convolution_parameters.hpp"
+#include "convolver.hpp"
+#include "interleave_indirect.hpp"
+#include "bfloat.hpp"
+
+#include <alloca.h>
+
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstdio>
+#include <cstring>
+#include <tuple>
+#include <type_traits>
+#include <vector>
+
+#include <arm_neon.h>
+
+#include "utils.hpp"
+
+namespace arm_gemm {
+
+/*
+ * Core function that does heavy lifting - interleave 'int_by' rows of width 'width' together.
+ *
+ * 'height' indicates the actual number of rows to interleave, so if it's less than int_by then the remaining
+ * entries are padded (note that this is "GEMM" padding rather than convolution padding, so there is no need to pad
+ * with a particular value).
+ *
+ * Note that it is not expected for this templated version to ever be used - all cases that matter should be
+ * explicitly specialized with an optimized implementation.
+ */
+template<unsigned int height_vectors, unsigned int block, VLType vlt, bool integrate_sums, typename TIn, typename TOut>
+void interleave_block( TOut * &out, const TIn * const *in, size_t width, size_t height, size_t row_offset, bool first) {
+ const unsigned int int_by = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
+
+ std::vector<int32_t> the_sums;
+
+ if (integrate_sums) {
+ the_sums = std::vector<int32_t>(int_by, 0);
+
+ if (!first) {
+ // In 'integrate sums' mode, we dump the sums at the end on each pass.
+
+ // On the last pass this is correct, but on other passes it is not -
+ // so on the subsequent pass we need to take the output written by
+ // the previous pass as starting point for the sums, and then
+ // overwrite them with new interleaved data.
+ int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
+
+ // Rewind pointer to where we wrote out the sums last time.
+ out_int32 -= int_by;
+
+ // Restore the running sums.
+ memcpy(the_sums.data(), out_int32, int_by * sizeof(int32_t));
+
+ // Update the "real" pointer so that the next output will clobber the old sums.
+ out = reinterpret_cast<TOut *>(out_int32);
+ }
+ }
+
+ for (unsigned int pos=0; pos<width; pos+=block) {
+ for (unsigned int row=0; row<int_by; row++) {
+ // Row out of range - pad 'block' entries.
+ if (row >= height) {
+ for (unsigned int col=0; col<block; col++) {
+ *out++ = 0;
+ }
+ continue;
+ }
+
+ for (unsigned int col=0; col<block; col++) {
+ // Column out of range - pad a single entry
+ if (pos + col >= width) {
+ *out++ = 0;
+ continue;
+ }
+
+ if (integrate_sums) {
+ the_sums[row] += in[row][row_offset + pos + col];
+ }
+
+ *out++ = in[row][row_offset + pos + col];
+ }
+ }
+ }
+
+ if (integrate_sums) {
+ int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
+
+ memcpy(out_int32, the_sums.data(), int_by * sizeof(int32_t));
+
+ out = reinterpret_cast<TOut *>(out_int32 + int_by);
+ }
+}
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut>
+inline void FixupRowSums(TOut * &out, const int32_t row_sum_multiplier) {
+ const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
+
+ // If we are integrating row sums, we need to do some fix up, depending on whether the multiplier is non-zero or not.
+ if (row_sum_multiplier) {
+ // Non-zero: interleave_block<>() will have done the sums, so 'out' will point to the start of the
+ // next block (post sums).
+ // We need to go back and apply the multiplier to the computed sums. We don't need to change 'out'.
+ int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
+
+ out_int32 -= height;
+ for (unsigned int i=0; i<height; i++) {
+ out_int32[i] *= row_sum_multiplier;
+ }
+ } else {
+ // Zero: interleave_block<>() will *not* have done the sums, so 'out' will point to the start of the
+ // sum block. We need to insert the (zero) sums, and advance 'out'.
+ int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
+
+ for (unsigned int i=0; i<height; i++) {
+ out_int32[i] = 0;
+ }
+
+ out_int32 += height;
+
+ out = reinterpret_cast<TOut *>(out_int32);
+ }
+}
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
+void IndirectInterleave(TOut *out, const TIn * const * const *ptr, unsigned int stringlen,
+ unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax,
+ const unsigned int k0, const unsigned int kmax, bool integrate_sums,
+ const int32_t row_sum_multiplier) {
+ const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
+
+ // 'interleave_block' implementations are entitled to read a pointer for each row they handle from the input
+ // pointer array, even for out of range rows (although they must not subsequently dereference those pointers for
+ // out of range rows). This allows interleave_block to use techniques like row predication, or loading all
+ // pointers and conditionally overriding the out of range ones.
+
+ // This is problematic in the "pure" indirect case when we get to the last rows, where it can lead to out of
+ // range reads. Avoid this with a local buffer to use in last-rows cases. Use alloca as a std::vector can be
+ // expensive in highly threaded scenarios.
+ const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
+
+ // Figure out the starting position based on k0 (with rounded length)
+ unsigned int start_string = k0 / rounded_stringlen;
+ unsigned int start_stringpos = k0 % rounded_stringlen;
+
+ // Process blocks of 'height' height...
+ for (unsigned int ybase = y0; ybase < ymax; ybase+=height) {
+ // Height to process
+ unsigned int active_height = std::min(ymax - ybase, height);
+
+ // Track our progress through the various strings
+ unsigned int k_left = (kmax - k0);
+ unsigned int string = start_string;
+ unsigned int stringpos = start_stringpos;
+
+ bool first = true;
+
+ // Prepare to call 'interleave_block' above for each string encompassed by K range
+ while (k_left > 0) {
+ // Width to process - and the width we will generate (with padding)
+ unsigned int in_width = std::min(k_left, stringlen - stringpos);
+ unsigned int out_width = std::min(k_left, rounded_stringlen - stringpos);
+
+ const TIn * const *row_base = ptr[string] + ybase;
+
+ // If not all rows are valid, copy the ones that are into local array (see above comment).
+ if (active_height < height) {
+ for (unsigned int i=0; i<active_height; i++) {
+ row_ptrs[i] = ptr[string][ybase + i];
+ }
+
+ row_base = row_ptrs;
+ }
+
+ // 'integrate_sums' is a function parameter rather than a template parameter to prevent duplicating too
+ // much code. However, integrated sums make no sense for non-integral types and won't ever be
+ // requested. So put a type trait check here to avoid generating pointless code.
+ if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
+ interleave_block<height_vectors, block, vlt, true>(out, row_base, in_width, active_height, stringpos, first);
+ } else {
+ interleave_block<height_vectors, block, vlt, false>(out, row_base, in_width, active_height, stringpos, first);
+ }
+
+ k_left -= out_width;
+ string++;
+ stringpos=0;
+ first=false;
+ }
+
+ if (std::is_integral<TOut>::value && integrate_sums) {
+ FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
+ }
+ }
+}
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
+void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const convolver<TIn> &conv, const unsigned int rounded_stringlen,
+ const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
+ const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
+
+ auto conv_cols = conv.process_columns(in, in_stride, k0, kmax, rounded_stringlen);
+
+ // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
+ const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
+
+ for (unsigned int ybase = y0; ybase < ymax; ybase += height) {
+ // How many of the rows are active - the rest will get padded in interleave_block.
+ unsigned int active_height = std::min(ymax - ybase, height);
+ bool first = true;
+
+ auto conv_rows = conv_cols.process_rows(ybase, active_height);
+
+ while (!conv_rows.finished()) {
+ unsigned int width, offset;
+
+ // Get next set of parameters
+ std::tie(width, offset) = conv_rows.next_block(row_ptrs);
+
+ // Perform the interleave
+ if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
+ interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, active_height, offset, first);
+ } else {
+ interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, active_height, offset, first);
+ }
+
+ first=false;
+ }
+
+ if (std::is_integral<TOut>::value && integrate_sums) {
+ FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
+ }
+ }
+}
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
+void Interleave(TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
+ const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
+
+ // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
+ const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
+
+ const unsigned int width=kmax-k0;
+
+ for (unsigned int y=y0; y<ymax; y+=height) {
+ for (unsigned int r=0; r<height; r++) {
+ row_ptrs[r] = in + ((y + r) * in_stride);
+ }
+
+ if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
+ interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
+ } else {
+ interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
+ }
+
+ if (std::is_integral<TOut>::value && integrate_sums) {
+ FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
+ }
+ }
+}
+
+#include "indirect-interleaves/list.hpp"
+
+/**** Instantiate needed implementations ****/
+
+/* AArch32 */
+#ifdef __arm__
+/* FP32 */
+/* NEON implementation (height 6) */
+template void IndirectInterleave<6, 1, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<6, 1, VLType::None>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<6, 1, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* FP16 */
+#if __ARM_FP16_ARGS
+/* NEON implementation using FP32 kernel (height 6) */
+template void IndirectInterleave<6, 1, VLType::None>(float *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<6, 1, VLType::None>(float *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<6, 1, VLType::None>(float *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+#endif /* __ARM_FP16_ARGS */
+
+/* BF16 */
+/* NEON implementation using FP32 kernel */
+template void IndirectInterleave<6, 1, VLType::None>(float *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<6, 1, VLType::None>(float *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<6, 1, VLType::None>(float *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+#endif
+
+/* AArch64 */
+#ifdef __aarch64__
+/* FP64 */
+/* NEON/SVE implementation (height 8) */
+template void IndirectInterleave<8, 1, VLType::None>(double *, const double * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<8, 1, VLType::None>(double *, const double *, size_t, const convolver<double> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 1, VLType::None>(double *, const double *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* FP32 */
+/* NEON/SVE implementation (height 8) */
+template void IndirectInterleave<8, 1, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<8, 1, VLType::None>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 1, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* FMMLA */
+template void IndirectInterleave<8, 2, VLType::None>(float *, const float * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<8, 2, VLType::None>(float *, const float *, size_t, const convolver<float> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 2, VLType::None>(float *, const float *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* FP16 */
+template void IndirectInterleave<8, 1, VLType::None>(__fp16 *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<8, 1, VLType::None>(__fp16 *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 1, VLType::None>(__fp16 *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+template void IndirectInterleave<8, 1, VLType::None>(float *, const __fp16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<8, 1, VLType::None>(float *, const __fp16 *, size_t, const convolver<__fp16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 1, VLType::None>(float *, const __fp16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* BF16 */
+/* NEON/SVE BFDOT */
+template void IndirectInterleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 2, VLType::None>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+template void IndirectInterleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 4, VLType::None>(bfloat16 *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* NEON/SVE using FP32 kernel */
+template void IndirectInterleave<8, 1, VLType::None>(float *, const bfloat16 * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<8, 1, VLType::None>(float *, const bfloat16 *, size_t, const convolver<bfloat16> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 1, VLType::None>(float *, const bfloat16 *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* INT16 */
+template void IndirectInterleave<8, 1, VLType::None>(int16_t *, const int16_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
+template void ConvolutionInterleave<8, 1, VLType::None>(int16_t *, const int16_t *, size_t, const convolver<int16_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 1, VLType::None>(int16_t *, const int16_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+template void IndirectInterleave<8, 1, VLType::None>(uint16_t *, const uint16_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
+template void ConvolutionInterleave<8, 1, VLType::None>(uint16_t *, const uint16_t *, size_t, const convolver<uint16_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 1, VLType::None>(uint16_t *, const uint16_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* INT8 */
+/* NEON SMLA/SMLAL (height 4, block 16) */
+template void IndirectInterleave<4, 16, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<4, 16, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<4, 16, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* NEON SDOT (height 8, block 4) */
+template void IndirectInterleave<8, 4, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
+template void ConvolutionInterleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 4, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* MMLA SMMLA (height 8, block 8) */
+template void IndirectInterleave<8, 8, VLType::None>(int8_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
+template void ConvolutionInterleave<8, 8, VLType::None>(int8_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 8, VLType::None>(int8_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* NEON 16-bit (height 8, block 1) */
+template void IndirectInterleave<8, 1, VLType::None>(int16_t *, const int8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
+template void ConvolutionInterleave<8, 1, VLType::None>(int16_t *, const int8_t *, size_t, const convolver<int8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 1, VLType::None>(int16_t *, const int8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* NEON UMLA/UMLAL (height 4, block 16) */
+template void IndirectInterleave<4, 16, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void ConvolutionInterleave<4, 16, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<4, 16, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* NEON UDOT (height 8, block 4) */
+template void IndirectInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
+template void ConvolutionInterleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 4, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* MMLA UMMLA (height 8, block 8) */
+template void IndirectInterleave<8, 8, VLType::None>(uint8_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
+template void ConvolutionInterleave<8, 8, VLType::None>(uint8_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 8, VLType::None>(uint8_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+
+/* NEON 16-bit (height 8, block 1) */
+template void IndirectInterleave<8, 1, VLType::None>(uint16_t *, const uint8_t * const * const *, unsigned int, unsigned int, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
+template void ConvolutionInterleave<8, 1, VLType::None>(uint16_t *, const uint8_t *, size_t, const convolver<uint8_t> &, unsigned int, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+template void Interleave<8, 1, VLType::None>(uint16_t *, const uint8_t *, size_t, unsigned int, unsigned int, unsigned int, unsigned int, bool, int32_t);
+#endif // __aarch64__
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect.hpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect.hpp
new file mode 100644
index 0000000000..660577f0e3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect.hpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <cstddef>
+#include <cstdint>
+
+#include "convolution_parameters.hpp"
+#include "convolver.hpp"
+#include "utils.hpp"
+
+namespace arm_gemm {
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
+void IndirectInterleave(TOut *out, const TIn * const * const *ptr, unsigned int stringlen, unsigned int rounded_stringlen, unsigned int y0, unsigned int ymax, unsigned int k0, unsigned int kmax, bool, int32_t);
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
+void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const convolver<TIn> &conv, const unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool, int32_t);
+
+template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
+void Interleave(TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool, int32_t);
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12.hpp
index 0f0e5a7ed4..8bf8d8442e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12.hpp
@@ -30,9 +30,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void a64_gemm_s16_asimd_12x8(const int16_t *, const int16_t *, int32_t *, int, int, int);
+void a64_gemm_s16_asimd_8x12(const int16_t *, const int16_t *, int32_t *, int, int, int);
-// 12x8 SGEMM "strategy" class.
+// 8x12 SGEMM "strategy" class.
//
// This describes the characteristics of a family of kernels, in terms of
// the required interleave properties and the output block size.
@@ -40,7 +40,7 @@ void a64_gemm_s16_asimd_12x8(const int16_t *, const int16_t *, int32_t *, int, i
// All kernels in the family must share these characteristics. The actual
// kernel to be used can be chosen at runtime, based on the CPU_type
// structure.
-class gemm_s16_12x8 {
+class cls_a64_gemm_s16_8x12 {
public:
typedef int16_t operand_type;
typedef int32_t result_type;
@@ -62,10 +62,11 @@ public:
// Use the standard fixed size transforms.
StdTransformsFixed<operand_type, result_type, 8, 12> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 1, true> transforms_quantized = {};
- kern_type kernel = a64_gemm_s16_asimd_12x8;
+ kern_type kernel = a64_gemm_s16_asimd_8x12;
- gemm_s16_12x8(const CPUInfo *) { }
+ cls_a64_gemm_s16_8x12(const CPUInfo *) { }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp
index 7052f83a3d..a77938ffa7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s16_8x12/generic.cpp
@@ -29,7 +29,7 @@
namespace arm_gemm {
-void a64_gemm_s16_asimd_12x8(const int16_t *Apanel, const int16_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
+void a64_gemm_s16_asimd_8x12(const int16_t *Apanel, const int16_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K)
{
const int16_t *a_ptr = Apanel;
int32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
index 256acc4c65..b68a5f518a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_4x4.hpp
@@ -34,7 +34,7 @@ void a64_gemm_s8_4x4(const int8_t *, const int8_t *, int32_t *, int, int, int);
#include "arm_gemm.hpp"
-class gemm_s8_4x4 {
+class cls_a64_gemm_s8_4x4 {
public:
typedef int8_t operand_type;
typedef int32_t result_type;
@@ -56,10 +56,11 @@ public:
// Use the standard fixed size transforms.
StdTransformsFixed<operand_type, result_type, 4, 4, 16> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 4, 4, 16, true> transforms_quantized = {};
kern_type kernel=a64_gemm_s8_4x4;
- gemm_s8_4x4(const CPUInfo *) { }
+ cls_a64_gemm_s8_4x4(const CPUInfo *) { }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
index 0e294bfe8d..eee817e8e7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12.hpp
@@ -32,11 +32,11 @@
namespace arm_gemm {
// Load the actual kernel
-void a64_gemm_s8_12x8(const int8_t *, const int8_t *, int32_t *, int, int, int);
-void a64_gemm_s8_12x8_a55r1(const int8_t *, const int8_t *, int32_t *, int, int, int);
-void a64_gemm_s8_12x8_x1(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void a64_gemm_s8_8x12(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void a64_gemm_s8_8x12_a55r1(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void a64_gemm_s8_8x12_x1(const int8_t *, const int8_t *, int32_t *, int, int, int);
-class gemm_s8_12x8 {
+class cls_a64_gemm_s8_8x12 {
public:
typedef int8_t operand_type;
typedef int32_t result_type;
@@ -58,16 +58,17 @@ public:
// Use the standard fixed size transforms.
StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 4, true> transforms_quantized = {};
- kern_type kernel = a64_gemm_s8_12x8;
+ kern_type kernel = a64_gemm_s8_8x12;
- gemm_s8_12x8(const CPUInfo *ci) {
+ cls_a64_gemm_s8_8x12(const CPUInfo *ci) {
auto mod = ci->get_cpu_model();
if (mod == CPUModel::A55r1) {
- kernel = a64_gemm_s8_12x8_a55r1;
+ kernel = a64_gemm_s8_8x12_a55r1;
} else if (mod == CPUModel::X1) {
- kernel = a64_gemm_s8_12x8_x1;
+ kernel = a64_gemm_s8_8x12_x1;
}
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp
index ddd8124ec9..bb5226e093 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/a55r1.cpp
@@ -29,7 +29,7 @@
namespace arm_gemm {
-void a64_gemm_s8_12x8_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, const int ablocks, const int bblocks, const int K) {
+void a64_gemm_s8_8x12_a55r1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, const int ablocks, const int bblocks, const int K) {
const int8_t *a_ptr = Apanel;
int32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp
index a7abaed9e0..7bf36a5900 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/generic.cpp
@@ -29,7 +29,7 @@
namespace arm_gemm {
-void a64_gemm_s8_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+void a64_gemm_s8_8x12(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
const int8_t *a_ptr = Apanel;
int32_t *c_ptr = Cpanel;
// We divide K by 4 because the sdot instruction processes 4 elements at a time.
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp
index 446fcf8707..afd2427b85 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_12x8/x1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_s8_8x12/x1.cpp
@@ -29,7 +29,7 @@
namespace arm_gemm {
-void a64_gemm_s8_12x8_x1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+void a64_gemm_s8_8x12_x1(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
const int8_t *a_ptr = Apanel;
int32_t *c_ptr = Cpanel;
// We divide K by 4 because the sdot instruction processes 4 elements at a time.
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12.hpp
index b86204043c..e49ebbd84e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12.hpp
@@ -30,17 +30,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void a64_gemm_u16_asimd_12x8(const uint16_t *, const uint16_t *, uint32_t *, int, int, int);
+void a64_gemm_u16_asimd_8x12(const uint16_t *, const uint16_t *, uint32_t *, int, int, int);
-// 12x8 SGEMM "strategy" class.
-//
-// This describes the characteristics of a family of kernels, in terms of
-// the required interleave properties and the output block size.
-//
-// All kernels in the family must share these characteristics. The actual
-// kernel to be used can be chosen at runtime, based on the CPU_type
-// structure.
-class gemm_u16_12x8 {
+class cls_a64_gemm_u16_8x12 {
public:
typedef uint16_t operand_type;
typedef uint32_t result_type;
@@ -62,10 +54,11 @@ public:
// Use the standard fixed size transforms.
StdTransformsFixed<operand_type, result_type, 8, 12> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 1, true> transforms_quantized = {};
- kern_type kernel = a64_gemm_u16_asimd_12x8;
+ kern_type kernel = a64_gemm_u16_asimd_8x12;
- gemm_u16_12x8(const CPUInfo *) { }
+ cls_a64_gemm_u16_8x12(const CPUInfo *) { }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12/generic.cpp
index 66f0b7c0ac..98da7830f0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u16_8x12/generic.cpp
@@ -29,7 +29,7 @@
namespace arm_gemm {
-void a64_gemm_u16_asimd_12x8(const uint16_t *Apanel, const uint16_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K)
+void a64_gemm_u16_asimd_8x12(const uint16_t *Apanel, const uint16_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K)
{
const uint16_t *a_ptr = Apanel;
uint32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
index 134007b74c..854b6751c1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_4x4.hpp
@@ -32,7 +32,7 @@ namespace arm_gemm {
// Kernel definition
void a64_gemm_u8_4x4(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K);
-class gemm_u8_4x4 {
+class cls_a64_gemm_u8_4x4 {
public:
typedef uint8_t operand_type;
typedef uint32_t result_type;
@@ -64,10 +64,11 @@ public:
// Use the standard fixed size transforms.
StdTransformsFixed<operand_type, result_type, 4, 4, 16> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 4, 4, 16, true> transforms_quantized = {};
kern_type kernel = a64_gemm_u8_4x4;
- gemm_u8_4x4(const CPUInfo *) { }
+ cls_a64_gemm_u8_4x4(const CPUInfo *) { }
};
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
index c0990ecd57..256ba2e08c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12.hpp
@@ -30,11 +30,11 @@
namespace arm_gemm {
// Load the actual kernel
-void a64_gemm_u8_12x8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
-void a64_gemm_u8_12x8_a55r1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
-void a64_gemm_u8_12x8_x1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void a64_gemm_u8_8x12(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void a64_gemm_u8_8x12_a55r1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void a64_gemm_u8_8x12_x1(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
-class gemm_u8_12x8 {
+class cls_a64_gemm_u8_8x12 {
public:
typedef uint8_t operand_type;
typedef uint32_t result_type;
@@ -66,16 +66,17 @@ public:
// Use the standard fixed sized transforms.
StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 4, true> transforms_quantized = {};
- kern_type kernel = a64_gemm_u8_12x8;
+ kern_type kernel = a64_gemm_u8_8x12;
- gemm_u8_12x8(const CPUInfo *ci) {
+ cls_a64_gemm_u8_8x12(const CPUInfo *ci) {
auto mod = ci->get_cpu_model();
if (mod == CPUModel::A55r1) {
- kernel = a64_gemm_u8_12x8_a55r1;
+ kernel = a64_gemm_u8_8x12_a55r1;
} else if (mod == CPUModel::X1) {
- kernel = a64_gemm_u8_12x8_x1;
+ kernel = a64_gemm_u8_8x12_x1;
}
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp
index c9a8a8229c..63869c9fd4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/a55r1.cpp
@@ -29,7 +29,7 @@
namespace arm_gemm {
-void a64_gemm_u8_12x8_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, const int ablocks, const int bblocks, const int K) {
+void a64_gemm_u8_8x12_a55r1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, const int ablocks, const int bblocks, const int K) {
const uint8_t *a_ptr = Apanel;
uint32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp
index 821e742f90..ff60cbc905 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/generic.cpp
@@ -29,7 +29,7 @@
namespace arm_gemm {
-void a64_gemm_u8_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+void a64_gemm_u8_8x12(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
const uint8_t *a_ptr = Apanel;
uint32_t *c_ptr = Cpanel;
// We divide K by 4 because the udot instruction processes 4 elements at a time.
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp
index 7fac67354f..1c1196b7a6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_12x8/x1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemm_u8_8x12/x1.cpp
@@ -29,7 +29,7 @@
namespace arm_gemm {
-void a64_gemm_u8_12x8_x1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+void a64_gemm_u8_8x12_x1(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
const uint8_t *a_ptr = Apanel;
uint32_t *c_ptr = Cpanel;
// We divide K by 4 because the udot instruction processes 4 elements at a time.
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32.hpp
index b60401b70d..b53172509e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32.hpp
@@ -25,32 +25,26 @@
#ifdef __aarch64__
-
+#include "../performance_parameters.hpp"
#include "../std_transforms_fixed.hpp"
namespace arm_gemm
{
// Actual kernel implementations
-void a64_hybrid_fp32_mla_4x8(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+void a64_gemv_fp32_mla_32(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool);
-class hybrid_fp32_mla_4x8
+class cls_a64_gemv_fp32_mla_32
{
public:
typedef float operand_type;
typedef float result_type;
- typedef void (*kern_type)(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
-
- /* Kernel blocking parameters */
- static constexpr unsigned int out_height()
- {
- return 8;
- }
+ typedef void (*kern_type)(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool);
static unsigned int out_width()
{
- return 4;
+ return 32;
}
static constexpr unsigned int k_unroll()
@@ -73,14 +67,13 @@ public:
return true;
}
- StdTransformsFixed<operand_type, result_type, 8, 4, 1> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 1, 32, 1> transforms = {};
// Default to the generic kernel
- kern_type kernel=a64_hybrid_fp32_mla_4x8;
+ kern_type kernel=a64_gemv_fp32_mla_32;
- hybrid_fp32_mla_4x8(const CPUInfo *)
+ cls_a64_gemv_fp32_mla_32(const CPUInfo *)
{
-
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp
new file mode 100644
index 0000000000..a2af8d6d14
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_gemv_fp32_mla_32/generic.cpp
@@ -0,0 +1,1546 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_gemv_fp32_mla_32 (
+ const float *A_ptr, const float *B_ptr, float *output_ptr,
+ size_t N, size_t K,
+ const float *bias, Activation act, bool
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ const float *B_ptr = {};
+ size_t output_offset = {};
+ unsigned int input_initial_col = {};
+ } ka;
+
+ unsigned long flags=0;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+ "add x22, %x[N], #0x3\n"
+ "mov x21, %x[bias]\n"
+ "lsr x22, x22, #0x2\n"
+ "1:" // Column loop
+ "cmp x22, #0x8\n"
+ "bge 85f\n"
+ "cmp x22, #0x6\n"
+ "bgt 73f\n"
+ "beq 61f\n"
+ "cmp x22, #0x4\n"
+ "bgt 49f\n"
+ "beq 37f\n"
+ "cmp x22, #0x2\n"
+ "bgt 25f\n"
+ "beq 13f\n"
+ "mov x20, %x[K]\n"
+ "mov x19, %x[A_ptr]\n"
+ "cbz x21, 2f\n"
+ "ldr q24, [x21, #0x0]\n"
+ "add x21, x21, #0x10\n"
+ "b 3f\n"
+ "2:" // Width 1: no bias
+ "movi v24.16b, #0x0\n"
+ "3:" // Width 1: setup done
+ "cmp x20, #0x4\n"
+ "blt 6f\n"
+ "cmp x20, #0x8\n"
+ "blt 5f\n"
+ "4:" // Width 1: Multiply loop: Main loop head
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q1, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v1.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q2, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v2.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q3, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v3.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q4, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v4.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "add x19, x19, #0x10\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "sub x20, x20, #0x4\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "cmp x20, #0x8\n"
+ "bge 4b\n"
+ "5:" // Width 1: Multiply loop: Single iteration only
+ "sub x20, x20, #0x4\n"
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q5, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v5.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q6, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v6.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q7, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v7.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q8, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v8.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "add x19, x19, #0x10\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "6:" // Width 1: Multiply loop: Main loop skip
+ "cbz x20, 8f\n"
+ "7:" // Width 1: Multiply loop: Odd block loop
+ "ldr s0, [x19], #0x4\n"
+ "ldr q9, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v9.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "sub x20, x20, #0x1\n"
+ "cbnz x20, 7b\n"
+ "8:" // Width 1: Multiply loop: No odd multiplies
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 9f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "9:" // Width 1: No activation
+ "cmp %x[N], #0x4\n"
+ "blt 10f\n"
+ "str q24, [%x[output_ptr], #0x0]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x10\n"
+ "b 12f\n"
+ "10:" // Width 1: Partial writeback
+ "tbz %x[N], #1, 11f\n"
+ "str d24, [%x[output_ptr]], #0x8\n"
+ "tbz %x[N], #0, 12f\n"
+ "st1 { v24.s }[2], [%x[output_ptr]]\n"
+ "b 12f\n"
+ "11:" // Width 1: Partial direct writeback: partial_1_0
+ "str s24, [%x[output_ptr], #0x0]\n"
+ "12:" // Width 1: Writeback done
+ "b 97f\n"
+ "13:" // Width 2
+ "mov x20, %x[K]\n"
+ "mov x19, %x[A_ptr]\n"
+ "cbz x21, 14f\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "add x21, x21, #0x20\n"
+ "b 15f\n"
+ "14:" // Width 2: no bias
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "15:" // Width 2: setup done
+ "cmp x20, #0x4\n"
+ "blt 18f\n"
+ "cmp x20, #0x8\n"
+ "blt 17f\n"
+ "16:" // Width 2: Multiply loop: Main loop head
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q1, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v1.4s, v0.s[0]\n"
+ "ldr q2, [%x[B_ptr], #0x10]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v25.4s, v2.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q3, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v3.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q4, [%x[B_ptr], #0x10]\n"
+ "fmla v25.4s, v4.4s, v0.s[1]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q5, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v5.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q6, [%x[B_ptr], #0x10]\n"
+ "fmla v25.4s, v6.4s, v0.s[2]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q7, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v7.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q8, [%x[B_ptr], #0x10]\n"
+ "fmla v25.4s, v8.4s, v0.s[3]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "add x19, x19, #0x10\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "sub x20, x20, #0x4\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "cmp x20, #0x8\n"
+ "bge 16b\n"
+ "17:" // Width 2: Multiply loop: Single iteration only
+ "sub x20, x20, #0x4\n"
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q9, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v9.4s, v0.s[0]\n"
+ "ldr q10, [%x[B_ptr], #0x10]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v25.4s, v10.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q11, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v11.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q12, [%x[B_ptr], #0x10]\n"
+ "fmla v25.4s, v12.4s, v0.s[1]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q13, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v13.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q14, [%x[B_ptr], #0x10]\n"
+ "fmla v25.4s, v14.4s, v0.s[2]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q15, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v15.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q16, [%x[B_ptr], #0x10]\n"
+ "fmla v25.4s, v16.4s, v0.s[3]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "add x19, x19, #0x10\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "18:" // Width 2: Multiply loop: Main loop skip
+ "cbz x20, 20f\n"
+ "19:" // Width 2: Multiply loop: Odd block loop
+ "ldr s0, [x19], #0x4\n"
+ "ldr q17, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v17.4s, v0.s[0]\n"
+ "ldr q18, [%x[B_ptr], #0x10]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v25.4s, v18.4s, v0.s[0]\n"
+ "sub x20, x20, #0x1\n"
+ "cbnz x20, 19b\n"
+ "20:" // Width 2: Multiply loop: No odd multiplies
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 21f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "21:" // Width 2: No activation
+ "str q24, [%x[output_ptr], #0x0]\n"
+ "cmp %x[N], #0x8\n"
+ "add %x[output_ptr], %x[output_ptr], #0x10\n"
+ "blt 22f\n"
+ "str q25, [%x[output_ptr], #0x0]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x10\n"
+ "b 24f\n"
+ "22:" // Width 2: Partial writeback
+ "tbz %x[N], #1, 23f\n"
+ "str d25, [%x[output_ptr]], #0x8\n"
+ "tbz %x[N], #0, 24f\n"
+ "st1 { v25.s }[2], [%x[output_ptr]]\n"
+ "b 24f\n"
+ "23:" // Width 2: Partial direct writeback: partial_1_4
+ "tbz %x[N], #0, 24f\n"
+ "str s25, [%x[output_ptr], #0x0]\n"
+ "24:" // Width 2: Writeback done
+ "b 97f\n"
+ "25:" // Width 3
+ "mov x20, %x[K]\n"
+ "mov x19, %x[A_ptr]\n"
+ "cbz x21, 26f\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "add x21, x21, #0x30\n"
+ "b 27f\n"
+ "26:" // Width 3: no bias
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "27:" // Width 3: setup done
+ "cmp x20, #0x4\n"
+ "blt 30f\n"
+ "cmp x20, #0x8\n"
+ "blt 29f\n"
+ "28:" // Width 3: Multiply loop: Main loop head
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q1, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v1.4s, v0.s[0]\n"
+ "ldr q2, [%x[B_ptr], #0x10]\n"
+ "ldr q3, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v2.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v26.4s, v3.4s, v0.s[0]\n"
+ "ldr q4, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v4.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q5, [%x[B_ptr], #0x10]\n"
+ "fmla v25.4s, v5.4s, v0.s[1]\n"
+ "ldr q6, [%x[B_ptr], #0x20]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v26.4s, v6.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q7, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v7.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q8, [%x[B_ptr], #0x10]\n"
+ "fmla v25.4s, v8.4s, v0.s[2]\n"
+ "ldr q9, [%x[B_ptr], #0x20]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v26.4s, v9.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q10, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v10.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q11, [%x[B_ptr], #0x10]\n"
+ "fmla v25.4s, v11.4s, v0.s[3]\n"
+ "ldr q12, [%x[B_ptr], #0x20]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v26.4s, v12.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "add x19, x19, #0x10\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "sub x20, x20, #0x4\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "cmp x20, #0x8\n"
+ "bge 28b\n"
+ "29:" // Width 3: Multiply loop: Single iteration only
+ "sub x20, x20, #0x4\n"
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q13, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v13.4s, v0.s[0]\n"
+ "ldr q14, [%x[B_ptr], #0x10]\n"
+ "ldr q15, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v14.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v26.4s, v15.4s, v0.s[0]\n"
+ "ldr q16, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v16.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q17, [%x[B_ptr], #0x10]\n"
+ "fmla v25.4s, v17.4s, v0.s[1]\n"
+ "ldr q18, [%x[B_ptr], #0x20]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v26.4s, v18.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q19, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v19.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q20, [%x[B_ptr], #0x10]\n"
+ "fmla v25.4s, v20.4s, v0.s[2]\n"
+ "ldr q21, [%x[B_ptr], #0x20]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v26.4s, v21.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q22, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v22.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q23, [%x[B_ptr], #0x10]\n"
+ "fmla v25.4s, v23.4s, v0.s[3]\n"
+ "ldr q1, [%x[B_ptr], #0x20]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v26.4s, v1.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "add x19, x19, #0x10\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "30:" // Width 3: Multiply loop: Main loop skip
+ "cbz x20, 32f\n"
+ "31:" // Width 3: Multiply loop: Odd block loop
+ "ldr s0, [x19], #0x4\n"
+ "ldr q2, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v2.4s, v0.s[0]\n"
+ "ldr q3, [%x[B_ptr], #0x10]\n"
+ "ldr q4, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v3.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v26.4s, v4.4s, v0.s[0]\n"
+ "sub x20, x20, #0x1\n"
+ "cbnz x20, 31b\n"
+ "32:" // Width 3: Multiply loop: No odd multiplies
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 33f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "33:" // Width 3: No activation
+ "str q24, [%x[output_ptr], #0x0]\n"
+ "str q25, [%x[output_ptr], #0x10]\n"
+ "cmp %x[N], #0xc\n"
+ "add %x[output_ptr], %x[output_ptr], #0x20\n"
+ "blt 34f\n"
+ "str q26, [%x[output_ptr], #0x0]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x10\n"
+ "b 36f\n"
+ "34:" // Width 3: Partial writeback
+ "tbz %x[N], #1, 35f\n"
+ "str d26, [%x[output_ptr]], #0x8\n"
+ "tbz %x[N], #0, 36f\n"
+ "st1 { v26.s }[2], [%x[output_ptr]]\n"
+ "b 36f\n"
+ "35:" // Width 3: Partial direct writeback: partial_1_8
+ "tbz %x[N], #0, 36f\n"
+ "str s26, [%x[output_ptr], #0x0]\n"
+ "36:" // Width 3: Writeback done
+ "b 97f\n"
+ "37:" // Width 4
+ "mov x20, %x[K]\n"
+ "mov x19, %x[A_ptr]\n"
+ "cbz x21, 38f\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "ldr q27, [x21, #0x30]\n"
+ "add x21, x21, #0x40\n"
+ "b 39f\n"
+ "38:" // Width 4: no bias
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "39:" // Width 4: setup done
+ "cmp x20, #0x4\n"
+ "blt 42f\n"
+ "cmp x20, #0x8\n"
+ "blt 41f\n"
+ "40:" // Width 4: Multiply loop: Main loop head
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q1, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v1.4s, v0.s[0]\n"
+ "ldr q2, [%x[B_ptr], #0x10]\n"
+ "ldr q3, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v2.4s, v0.s[0]\n"
+ "ldr q4, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v3.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v4.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q5, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v5.4s, v0.s[1]\n"
+ "ldr q6, [%x[B_ptr], #0x10]\n"
+ "ldr q7, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v6.4s, v0.s[1]\n"
+ "ldr q8, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v7.4s, v0.s[1]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v8.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q9, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v9.4s, v0.s[2]\n"
+ "ldr q10, [%x[B_ptr], #0x10]\n"
+ "ldr q11, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v10.4s, v0.s[2]\n"
+ "ldr q12, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v11.4s, v0.s[2]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v12.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q13, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v13.4s, v0.s[3]\n"
+ "ldr q14, [%x[B_ptr], #0x10]\n"
+ "ldr q15, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v14.4s, v0.s[3]\n"
+ "ldr q16, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v15.4s, v0.s[3]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v16.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "add x19, x19, #0x10\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "sub x20, x20, #0x4\n"
+ "cmp x20, #0x8\n"
+ "bge 40b\n"
+ "41:" // Width 4: Multiply loop: Single iteration only
+ "sub x20, x20, #0x4\n"
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q17, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v17.4s, v0.s[0]\n"
+ "ldr q18, [%x[B_ptr], #0x10]\n"
+ "ldr q19, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v18.4s, v0.s[0]\n"
+ "ldr q20, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v19.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v20.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q21, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v21.4s, v0.s[1]\n"
+ "ldr q22, [%x[B_ptr], #0x10]\n"
+ "ldr q23, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v22.4s, v0.s[1]\n"
+ "ldr q1, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v23.4s, v0.s[1]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v1.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q2, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v2.4s, v0.s[2]\n"
+ "ldr q3, [%x[B_ptr], #0x10]\n"
+ "ldr q4, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v3.4s, v0.s[2]\n"
+ "ldr q5, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v4.4s, v0.s[2]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v5.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q6, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v6.4s, v0.s[3]\n"
+ "ldr q7, [%x[B_ptr], #0x10]\n"
+ "ldr q8, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v7.4s, v0.s[3]\n"
+ "ldr q9, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v8.4s, v0.s[3]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v9.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "add x19, x19, #0x10\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "42:" // Width 4: Multiply loop: Main loop skip
+ "cbz x20, 44f\n"
+ "43:" // Width 4: Multiply loop: Odd block loop
+ "ldr s0, [x19], #0x4\n"
+ "ldr q10, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v10.4s, v0.s[0]\n"
+ "ldr q11, [%x[B_ptr], #0x10]\n"
+ "ldr q12, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v11.4s, v0.s[0]\n"
+ "ldr q13, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v12.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "sub x20, x20, #0x1\n"
+ "fmla v27.4s, v13.4s, v0.s[0]\n"
+ "cbnz x20, 43b\n"
+ "44:" // Width 4: Multiply loop: No odd multiplies
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 45f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v17.4s\n"
+ "45:" // Width 4: No activation
+ "str q24, [%x[output_ptr], #0x0]\n"
+ "str q25, [%x[output_ptr], #0x10]\n"
+ "str q26, [%x[output_ptr], #0x20]\n"
+ "cmp %x[N], #0x10\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "blt 46f\n"
+ "str q27, [%x[output_ptr], #0x0]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x10\n"
+ "b 48f\n"
+ "46:" // Width 4: Partial writeback
+ "tbz %x[N], #1, 47f\n"
+ "str d27, [%x[output_ptr]], #0x8\n"
+ "tbz %x[N], #0, 48f\n"
+ "st1 { v27.s }[2], [%x[output_ptr]]\n"
+ "b 48f\n"
+ "47:" // Width 4: Partial direct writeback: partial_1_12
+ "tbz %x[N], #0, 48f\n"
+ "str s27, [%x[output_ptr], #0x0]\n"
+ "48:" // Width 4: Writeback done
+ "b 97f\n"
+ "49:" // Width 5
+ "mov x20, %x[K]\n"
+ "mov x19, %x[A_ptr]\n"
+ "cbz x21, 50f\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "ldr q27, [x21, #0x30]\n"
+ "ldr q28, [x21, #0x40]\n"
+ "add x21, x21, #0x50\n"
+ "b 51f\n"
+ "50:" // Width 5: no bias
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "51:" // Width 5: setup done
+ "cmp x20, #0x4\n"
+ "blt 54f\n"
+ "cmp x20, #0x8\n"
+ "blt 53f\n"
+ "52:" // Width 5: Multiply loop: Main loop head
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q1, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v1.4s, v0.s[0]\n"
+ "ldr q2, [%x[B_ptr], #0x10]\n"
+ "ldr q3, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v2.4s, v0.s[0]\n"
+ "ldr q4, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v3.4s, v0.s[0]\n"
+ "ldr q5, [%x[B_ptr], #0x40]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v27.4s, v4.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q6, [%x[B_ptr], #0x0]\n"
+ "fmla v28.4s, v5.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q7, [%x[B_ptr], #0x10]\n"
+ "fmla v24.4s, v6.4s, v0.s[1]\n"
+ "ldr q8, [%x[B_ptr], #0x20]\n"
+ "ldr q9, [%x[B_ptr], #0x30]\n"
+ "fmla v25.4s, v7.4s, v0.s[1]\n"
+ "ldr q10, [%x[B_ptr], #0x40]\n"
+ "fmla v26.4s, v8.4s, v0.s[1]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v9.4s, v0.s[1]\n"
+ "ldr q11, [%x[B_ptr], #0x0]\n"
+ "fmla v28.4s, v10.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q12, [%x[B_ptr], #0x10]\n"
+ "fmla v24.4s, v11.4s, v0.s[2]\n"
+ "ldr q13, [%x[B_ptr], #0x20]\n"
+ "ldr q14, [%x[B_ptr], #0x30]\n"
+ "fmla v25.4s, v12.4s, v0.s[2]\n"
+ "ldr q15, [%x[B_ptr], #0x40]\n"
+ "fmla v26.4s, v13.4s, v0.s[2]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v14.4s, v0.s[2]\n"
+ "ldr q16, [%x[B_ptr], #0x0]\n"
+ "fmla v28.4s, v15.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q17, [%x[B_ptr], #0x10]\n"
+ "fmla v24.4s, v16.4s, v0.s[3]\n"
+ "ldr q18, [%x[B_ptr], #0x20]\n"
+ "ldr q19, [%x[B_ptr], #0x30]\n"
+ "fmla v25.4s, v17.4s, v0.s[3]\n"
+ "ldr q20, [%x[B_ptr], #0x40]\n"
+ "fmla v26.4s, v18.4s, v0.s[3]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v19.4s, v0.s[3]\n"
+ "add x19, x19, #0x10\n"
+ "fmla v28.4s, v20.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "sub x20, x20, #0x4\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "cmp x20, #0x8\n"
+ "bge 52b\n"
+ "53:" // Width 5: Multiply loop: Single iteration only
+ "sub x20, x20, #0x4\n"
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q21, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v21.4s, v0.s[0]\n"
+ "ldr q22, [%x[B_ptr], #0x10]\n"
+ "ldr q23, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v22.4s, v0.s[0]\n"
+ "ldr q1, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v23.4s, v0.s[0]\n"
+ "ldr q2, [%x[B_ptr], #0x40]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v27.4s, v1.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "ldr q3, [%x[B_ptr], #0x0]\n"
+ "fmla v28.4s, v2.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q4, [%x[B_ptr], #0x10]\n"
+ "fmla v24.4s, v3.4s, v0.s[1]\n"
+ "ldr q5, [%x[B_ptr], #0x20]\n"
+ "ldr q6, [%x[B_ptr], #0x30]\n"
+ "fmla v25.4s, v4.4s, v0.s[1]\n"
+ "ldr q7, [%x[B_ptr], #0x40]\n"
+ "fmla v26.4s, v5.4s, v0.s[1]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v6.4s, v0.s[1]\n"
+ "ldr q8, [%x[B_ptr], #0x0]\n"
+ "fmla v28.4s, v7.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q9, [%x[B_ptr], #0x10]\n"
+ "fmla v24.4s, v8.4s, v0.s[2]\n"
+ "ldr q10, [%x[B_ptr], #0x20]\n"
+ "ldr q11, [%x[B_ptr], #0x30]\n"
+ "fmla v25.4s, v9.4s, v0.s[2]\n"
+ "ldr q12, [%x[B_ptr], #0x40]\n"
+ "fmla v26.4s, v10.4s, v0.s[2]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v11.4s, v0.s[2]\n"
+ "ldr q13, [%x[B_ptr], #0x0]\n"
+ "fmla v28.4s, v12.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q14, [%x[B_ptr], #0x10]\n"
+ "fmla v24.4s, v13.4s, v0.s[3]\n"
+ "ldr q15, [%x[B_ptr], #0x20]\n"
+ "ldr q16, [%x[B_ptr], #0x30]\n"
+ "fmla v25.4s, v14.4s, v0.s[3]\n"
+ "ldr q17, [%x[B_ptr], #0x40]\n"
+ "fmla v26.4s, v15.4s, v0.s[3]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v16.4s, v0.s[3]\n"
+ "add x19, x19, #0x10\n"
+ "fmla v28.4s, v17.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "54:" // Width 5: Multiply loop: Main loop skip
+ "cbz x20, 56f\n"
+ "55:" // Width 5: Multiply loop: Odd block loop
+ "ldr s0, [x19], #0x4\n"
+ "ldr q18, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v18.4s, v0.s[0]\n"
+ "ldr q19, [%x[B_ptr], #0x10]\n"
+ "ldr q20, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v19.4s, v0.s[0]\n"
+ "ldr q21, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v20.4s, v0.s[0]\n"
+ "ldr q22, [%x[B_ptr], #0x40]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v27.4s, v21.4s, v0.s[0]\n"
+ "sub x20, x20, #0x1\n"
+ "fmla v28.4s, v22.4s, v0.s[0]\n"
+ "cbnz x20, 55b\n"
+ "56:" // Width 5: Multiply loop: No odd multiplies
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 57f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v17.4s\n"
+ "57:" // Width 5: No activation
+ "str q24, [%x[output_ptr], #0x0]\n"
+ "str q25, [%x[output_ptr], #0x10]\n"
+ "str q26, [%x[output_ptr], #0x20]\n"
+ "str q27, [%x[output_ptr], #0x30]\n"
+ "cmp %x[N], #0x14\n"
+ "add %x[output_ptr], %x[output_ptr], #0x40\n"
+ "blt 58f\n"
+ "str q28, [%x[output_ptr], #0x0]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x10\n"
+ "b 60f\n"
+ "58:" // Width 5: Partial writeback
+ "tbz %x[N], #1, 59f\n"
+ "str d28, [%x[output_ptr]], #0x8\n"
+ "tbz %x[N], #0, 60f\n"
+ "st1 { v28.s }[2], [%x[output_ptr]]\n"
+ "b 60f\n"
+ "59:" // Width 5: Partial direct writeback: partial_1_16
+ "tbz %x[N], #0, 60f\n"
+ "str s28, [%x[output_ptr], #0x0]\n"
+ "60:" // Width 5: Writeback done
+ "b 97f\n"
+ "61:" // Width 6
+ "mov x20, %x[K]\n"
+ "mov x19, %x[A_ptr]\n"
+ "cbz x21, 62f\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "ldr q27, [x21, #0x30]\n"
+ "ldr q28, [x21, #0x40]\n"
+ "ldr q29, [x21, #0x50]\n"
+ "add x21, x21, #0x60\n"
+ "b 63f\n"
+ "62:" // Width 6: no bias
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "63:" // Width 6: setup done
+ "cmp x20, #0x4\n"
+ "blt 66f\n"
+ "cmp x20, #0x8\n"
+ "blt 65f\n"
+ "64:" // Width 6: Multiply loop: Main loop head
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q1, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v1.4s, v0.s[0]\n"
+ "ldr q2, [%x[B_ptr], #0x10]\n"
+ "ldr q3, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v2.4s, v0.s[0]\n"
+ "ldr q4, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v3.4s, v0.s[0]\n"
+ "ldr q5, [%x[B_ptr], #0x40]\n"
+ "ldr q6, [%x[B_ptr], #0x50]\n"
+ "fmla v27.4s, v4.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v28.4s, v5.4s, v0.s[0]\n"
+ "ldr q7, [%x[B_ptr], #0x0]\n"
+ "fmla v29.4s, v6.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q8, [%x[B_ptr], #0x10]\n"
+ "fmla v24.4s, v7.4s, v0.s[1]\n"
+ "ldr q9, [%x[B_ptr], #0x20]\n"
+ "ldr q10, [%x[B_ptr], #0x30]\n"
+ "fmla v25.4s, v8.4s, v0.s[1]\n"
+ "ldr q11, [%x[B_ptr], #0x40]\n"
+ "fmla v26.4s, v9.4s, v0.s[1]\n"
+ "ldr q12, [%x[B_ptr], #0x50]\n"
+ "fmla v27.4s, v10.4s, v0.s[1]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v28.4s, v11.4s, v0.s[1]\n"
+ "ldr q13, [%x[B_ptr], #0x0]\n"
+ "fmla v29.4s, v12.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q14, [%x[B_ptr], #0x10]\n"
+ "fmla v24.4s, v13.4s, v0.s[2]\n"
+ "ldr q15, [%x[B_ptr], #0x20]\n"
+ "ldr q16, [%x[B_ptr], #0x30]\n"
+ "fmla v25.4s, v14.4s, v0.s[2]\n"
+ "ldr q17, [%x[B_ptr], #0x40]\n"
+ "ldr q18, [%x[B_ptr], #0x50]\n"
+ "fmla v26.4s, v15.4s, v0.s[2]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v16.4s, v0.s[2]\n"
+ "ldr q19, [%x[B_ptr], #0x0]\n"
+ "fmla v28.4s, v17.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q20, [%x[B_ptr], #0x10]\n"
+ "fmla v29.4s, v18.4s, v0.s[2]\n"
+ "ldr q21, [%x[B_ptr], #0x20]\n"
+ "ldr q22, [%x[B_ptr], #0x30]\n"
+ "fmla v24.4s, v19.4s, v0.s[3]\n"
+ "ldr q23, [%x[B_ptr], #0x40]\n"
+ "ldr q1, [%x[B_ptr], #0x50]\n"
+ "fmla v25.4s, v20.4s, v0.s[3]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v26.4s, v21.4s, v0.s[3]\n"
+ "add x19, x19, #0x10\n"
+ "fmla v27.4s, v22.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "sub x20, x20, #0x4\n"
+ "fmla v28.4s, v23.4s, v0.s[3]\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "cmp x20, #0x8\n"
+ "fmla v29.4s, v1.4s, v0.s[3]\n"
+ "bge 64b\n"
+ "65:" // Width 6: Multiply loop: Single iteration only
+ "sub x20, x20, #0x4\n"
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q2, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v2.4s, v0.s[0]\n"
+ "ldr q3, [%x[B_ptr], #0x10]\n"
+ "ldr q4, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v3.4s, v0.s[0]\n"
+ "ldr q5, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v4.4s, v0.s[0]\n"
+ "ldr q6, [%x[B_ptr], #0x40]\n"
+ "ldr q7, [%x[B_ptr], #0x50]\n"
+ "fmla v27.4s, v5.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v28.4s, v6.4s, v0.s[0]\n"
+ "ldr q8, [%x[B_ptr], #0x0]\n"
+ "fmla v29.4s, v7.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q9, [%x[B_ptr], #0x10]\n"
+ "fmla v24.4s, v8.4s, v0.s[1]\n"
+ "ldr q10, [%x[B_ptr], #0x20]\n"
+ "ldr q11, [%x[B_ptr], #0x30]\n"
+ "fmla v25.4s, v9.4s, v0.s[1]\n"
+ "ldr q12, [%x[B_ptr], #0x40]\n"
+ "fmla v26.4s, v10.4s, v0.s[1]\n"
+ "ldr q13, [%x[B_ptr], #0x50]\n"
+ "fmla v27.4s, v11.4s, v0.s[1]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v28.4s, v12.4s, v0.s[1]\n"
+ "ldr q14, [%x[B_ptr], #0x0]\n"
+ "fmla v29.4s, v13.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q15, [%x[B_ptr], #0x10]\n"
+ "fmla v24.4s, v14.4s, v0.s[2]\n"
+ "ldr q16, [%x[B_ptr], #0x20]\n"
+ "ldr q17, [%x[B_ptr], #0x30]\n"
+ "fmla v25.4s, v15.4s, v0.s[2]\n"
+ "ldr q18, [%x[B_ptr], #0x40]\n"
+ "ldr q19, [%x[B_ptr], #0x50]\n"
+ "fmla v26.4s, v16.4s, v0.s[2]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v17.4s, v0.s[2]\n"
+ "ldr q20, [%x[B_ptr], #0x0]\n"
+ "fmla v28.4s, v18.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q21, [%x[B_ptr], #0x10]\n"
+ "fmla v29.4s, v19.4s, v0.s[2]\n"
+ "ldr q22, [%x[B_ptr], #0x20]\n"
+ "ldr q23, [%x[B_ptr], #0x30]\n"
+ "fmla v24.4s, v20.4s, v0.s[3]\n"
+ "ldr q1, [%x[B_ptr], #0x40]\n"
+ "ldr q2, [%x[B_ptr], #0x50]\n"
+ "fmla v25.4s, v21.4s, v0.s[3]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v26.4s, v22.4s, v0.s[3]\n"
+ "add x19, x19, #0x10\n"
+ "fmla v27.4s, v23.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "fmla v28.4s, v1.4s, v0.s[3]\n"
+ "fmla v29.4s, v2.4s, v0.s[3]\n"
+ "66:" // Width 6: Multiply loop: Main loop skip
+ "cbz x20, 68f\n"
+ "67:" // Width 6: Multiply loop: Odd block loop
+ "ldr s0, [x19], #0x4\n"
+ "ldr q3, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v3.4s, v0.s[0]\n"
+ "ldr q4, [%x[B_ptr], #0x10]\n"
+ "ldr q5, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v4.4s, v0.s[0]\n"
+ "ldr q6, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v5.4s, v0.s[0]\n"
+ "ldr q7, [%x[B_ptr], #0x40]\n"
+ "ldr q8, [%x[B_ptr], #0x50]\n"
+ "fmla v27.4s, v6.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "sub x20, x20, #0x1\n"
+ "fmla v28.4s, v7.4s, v0.s[0]\n"
+ "fmla v29.4s, v8.4s, v0.s[0]\n"
+ "cbnz x20, 67b\n"
+ "68:" // Width 6: Multiply loop: No odd multiplies
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 69f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v17.4s\n"
+ "fmax v29.4s, v29.4s, v17.4s\n"
+ "69:" // Width 6: No activation
+ "str q24, [%x[output_ptr], #0x0]\n"
+ "str q25, [%x[output_ptr], #0x10]\n"
+ "str q26, [%x[output_ptr], #0x20]\n"
+ "str q27, [%x[output_ptr], #0x30]\n"
+ "str q28, [%x[output_ptr], #0x40]\n"
+ "cmp %x[N], #0x18\n"
+ "add %x[output_ptr], %x[output_ptr], #0x50\n"
+ "blt 70f\n"
+ "str q29, [%x[output_ptr], #0x0]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x10\n"
+ "b 72f\n"
+ "70:" // Width 6: Partial writeback
+ "tbz %x[N], #1, 71f\n"
+ "str d29, [%x[output_ptr]], #0x8\n"
+ "tbz %x[N], #0, 72f\n"
+ "st1 { v29.s }[2], [%x[output_ptr]]\n"
+ "b 72f\n"
+ "71:" // Width 6: Partial direct writeback: partial_1_20
+ "tbz %x[N], #0, 72f\n"
+ "str s29, [%x[output_ptr], #0x0]\n"
+ "72:" // Width 6: Writeback done
+ "b 97f\n"
+ "73:" // Width 7
+ "mov x20, %x[K]\n"
+ "mov x19, %x[A_ptr]\n"
+ "cbz x21, 74f\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "ldr q27, [x21, #0x30]\n"
+ "ldr q28, [x21, #0x40]\n"
+ "ldr q29, [x21, #0x50]\n"
+ "ldr q30, [x21, #0x60]\n"
+ "add x21, x21, #0x70\n"
+ "b 75f\n"
+ "74:" // Width 7: no bias
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "75:" // Width 7: setup done
+ "cmp x20, #0x4\n"
+ "blt 78f\n"
+ "cmp x20, #0x8\n"
+ "blt 77f\n"
+ "76:" // Width 7: Multiply loop: Main loop head
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q1, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v1.4s, v0.s[0]\n"
+ "ldr q2, [%x[B_ptr], #0x10]\n"
+ "ldr q3, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v2.4s, v0.s[0]\n"
+ "ldr q4, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v3.4s, v0.s[0]\n"
+ "ldr q5, [%x[B_ptr], #0x40]\n"
+ "ldr q6, [%x[B_ptr], #0x50]\n"
+ "fmla v27.4s, v4.4s, v0.s[0]\n"
+ "ldr q7, [%x[B_ptr], #0x60]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v28.4s, v5.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v29.4s, v6.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q8, [%x[B_ptr], #0x0]\n"
+ "fmla v30.4s, v7.4s, v0.s[0]\n"
+ "ldr q9, [%x[B_ptr], #0x10]\n"
+ "ldr q10, [%x[B_ptr], #0x20]\n"
+ "fmla v24.4s, v8.4s, v0.s[1]\n"
+ "ldr q11, [%x[B_ptr], #0x30]\n"
+ "ldr q12, [%x[B_ptr], #0x40]\n"
+ "fmla v25.4s, v9.4s, v0.s[1]\n"
+ "ldr q13, [%x[B_ptr], #0x50]\n"
+ "fmla v26.4s, v10.4s, v0.s[1]\n"
+ "ldr q14, [%x[B_ptr], #0x60]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v27.4s, v11.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v28.4s, v12.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q15, [%x[B_ptr], #0x0]\n"
+ "fmla v29.4s, v13.4s, v0.s[1]\n"
+ "ldr q16, [%x[B_ptr], #0x10]\n"
+ "ldr q17, [%x[B_ptr], #0x20]\n"
+ "fmla v30.4s, v14.4s, v0.s[1]\n"
+ "ldr q18, [%x[B_ptr], #0x30]\n"
+ "fmla v24.4s, v15.4s, v0.s[2]\n"
+ "ldr q19, [%x[B_ptr], #0x40]\n"
+ "ldr q20, [%x[B_ptr], #0x50]\n"
+ "fmla v25.4s, v16.4s, v0.s[2]\n"
+ "ldr q21, [%x[B_ptr], #0x60]\n"
+ "fmla v26.4s, v17.4s, v0.s[2]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v18.4s, v0.s[2]\n"
+ "ldr q22, [%x[B_ptr], #0x0]\n"
+ "fmla v28.4s, v19.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q23, [%x[B_ptr], #0x10]\n"
+ "fmla v29.4s, v20.4s, v0.s[2]\n"
+ "ldr q1, [%x[B_ptr], #0x20]\n"
+ "ldr q2, [%x[B_ptr], #0x30]\n"
+ "fmla v30.4s, v21.4s, v0.s[2]\n"
+ "ldr q3, [%x[B_ptr], #0x40]\n"
+ "fmla v24.4s, v22.4s, v0.s[3]\n"
+ "ldr q4, [%x[B_ptr], #0x50]\n"
+ "ldr q5, [%x[B_ptr], #0x60]\n"
+ "fmla v25.4s, v23.4s, v0.s[3]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v26.4s, v1.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v2.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "add x19, x19, #0x10\n"
+ "fmla v28.4s, v3.4s, v0.s[3]\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "sub x20, x20, #0x4\n"
+ "fmla v29.4s, v4.4s, v0.s[3]\n"
+ "cmp x20, #0x8\n"
+ "fmla v30.4s, v5.4s, v0.s[3]\n"
+ "bge 76b\n"
+ "77:" // Width 7: Multiply loop: Single iteration only
+ "sub x20, x20, #0x4\n"
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q6, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [%x[B_ptr], #0x10]\n"
+ "ldr q8, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v7.4s, v0.s[0]\n"
+ "ldr q9, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v8.4s, v0.s[0]\n"
+ "ldr q10, [%x[B_ptr], #0x40]\n"
+ "ldr q11, [%x[B_ptr], #0x50]\n"
+ "fmla v27.4s, v9.4s, v0.s[0]\n"
+ "ldr q12, [%x[B_ptr], #0x60]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v28.4s, v10.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v29.4s, v11.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q13, [%x[B_ptr], #0x0]\n"
+ "fmla v30.4s, v12.4s, v0.s[0]\n"
+ "ldr q14, [%x[B_ptr], #0x10]\n"
+ "ldr q15, [%x[B_ptr], #0x20]\n"
+ "fmla v24.4s, v13.4s, v0.s[1]\n"
+ "ldr q16, [%x[B_ptr], #0x30]\n"
+ "ldr q17, [%x[B_ptr], #0x40]\n"
+ "fmla v25.4s, v14.4s, v0.s[1]\n"
+ "ldr q18, [%x[B_ptr], #0x50]\n"
+ "fmla v26.4s, v15.4s, v0.s[1]\n"
+ "ldr q19, [%x[B_ptr], #0x60]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v27.4s, v16.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v28.4s, v17.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q20, [%x[B_ptr], #0x0]\n"
+ "fmla v29.4s, v18.4s, v0.s[1]\n"
+ "ldr q21, [%x[B_ptr], #0x10]\n"
+ "ldr q22, [%x[B_ptr], #0x20]\n"
+ "fmla v30.4s, v19.4s, v0.s[1]\n"
+ "ldr q23, [%x[B_ptr], #0x30]\n"
+ "fmla v24.4s, v20.4s, v0.s[2]\n"
+ "ldr q1, [%x[B_ptr], #0x40]\n"
+ "ldr q2, [%x[B_ptr], #0x50]\n"
+ "fmla v25.4s, v21.4s, v0.s[2]\n"
+ "ldr q3, [%x[B_ptr], #0x60]\n"
+ "fmla v26.4s, v22.4s, v0.s[2]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v23.4s, v0.s[2]\n"
+ "ldr q4, [%x[B_ptr], #0x0]\n"
+ "fmla v28.4s, v1.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q5, [%x[B_ptr], #0x10]\n"
+ "fmla v29.4s, v2.4s, v0.s[2]\n"
+ "ldr q6, [%x[B_ptr], #0x20]\n"
+ "ldr q7, [%x[B_ptr], #0x30]\n"
+ "fmla v30.4s, v3.4s, v0.s[2]\n"
+ "ldr q8, [%x[B_ptr], #0x40]\n"
+ "fmla v24.4s, v4.4s, v0.s[3]\n"
+ "ldr q9, [%x[B_ptr], #0x50]\n"
+ "ldr q10, [%x[B_ptr], #0x60]\n"
+ "fmla v25.4s, v5.4s, v0.s[3]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v26.4s, v6.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v27.4s, v7.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "add x19, x19, #0x10\n"
+ "fmla v28.4s, v8.4s, v0.s[3]\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "fmla v29.4s, v9.4s, v0.s[3]\n"
+ "fmla v30.4s, v10.4s, v0.s[3]\n"
+ "78:" // Width 7: Multiply loop: Main loop skip
+ "cbz x20, 80f\n"
+ "79:" // Width 7: Multiply loop: Odd block loop
+ "ldr s0, [x19], #0x4\n"
+ "ldr q11, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v11.4s, v0.s[0]\n"
+ "ldr q12, [%x[B_ptr], #0x10]\n"
+ "ldr q13, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v12.4s, v0.s[0]\n"
+ "ldr q14, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v13.4s, v0.s[0]\n"
+ "ldr q15, [%x[B_ptr], #0x40]\n"
+ "ldr q16, [%x[B_ptr], #0x50]\n"
+ "fmla v27.4s, v14.4s, v0.s[0]\n"
+ "ldr q17, [%x[B_ptr], #0x60]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "fmla v28.4s, v15.4s, v0.s[0]\n"
+ "fmla v29.4s, v16.4s, v0.s[0]\n"
+ "sub x20, x20, #0x1\n"
+ "fmla v30.4s, v17.4s, v0.s[0]\n"
+ "cbnz x20, 79b\n"
+ "80:" // Width 7: Multiply loop: No odd multiplies
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 81f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v16.4s\n"
+ "fmin v30.4s, v30.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v17.4s\n"
+ "fmax v29.4s, v29.4s, v17.4s\n"
+ "fmax v30.4s, v30.4s, v17.4s\n"
+ "81:" // Width 7: No activation
+ "str q24, [%x[output_ptr], #0x0]\n"
+ "str q25, [%x[output_ptr], #0x10]\n"
+ "str q26, [%x[output_ptr], #0x20]\n"
+ "str q27, [%x[output_ptr], #0x30]\n"
+ "str q28, [%x[output_ptr], #0x40]\n"
+ "str q29, [%x[output_ptr], #0x50]\n"
+ "cmp %x[N], #0x1c\n"
+ "add %x[output_ptr], %x[output_ptr], #0x60\n"
+ "blt 82f\n"
+ "str q30, [%x[output_ptr], #0x0]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x10\n"
+ "b 84f\n"
+ "82:" // Width 7: Partial writeback
+ "tbz %x[N], #1, 83f\n"
+ "str d30, [%x[output_ptr]], #0x8\n"
+ "tbz %x[N], #0, 84f\n"
+ "st1 { v30.s }[2], [%x[output_ptr]]\n"
+ "b 84f\n"
+ "83:" // Width 7: Partial direct writeback: partial_1_24
+ "tbz %x[N], #0, 84f\n"
+ "str s30, [%x[output_ptr], #0x0]\n"
+ "84:" // Width 7: Writeback done
+ "b 97f\n"
+ "85:" // Width 8
+ "mov x20, %x[K]\n"
+ "mov x19, %x[A_ptr]\n"
+ "cbz x21, 86f\n"
+ "ldr q24, [x21, #0x0]\n"
+ "ldr q25, [x21, #0x10]\n"
+ "ldr q26, [x21, #0x20]\n"
+ "ldr q27, [x21, #0x30]\n"
+ "ldr q28, [x21, #0x40]\n"
+ "ldr q29, [x21, #0x50]\n"
+ "ldr q30, [x21, #0x60]\n"
+ "ldr q31, [x21, #0x70]\n"
+ "add x21, x21, #0x80\n"
+ "b 87f\n"
+ "86:" // Width 8: no bias
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "87:" // Width 8: setup done
+ "cmp x20, #0x4\n"
+ "blt 90f\n"
+ "cmp x20, #0x8\n"
+ "blt 89f\n"
+ "88:" // Width 8: Multiply loop: Main loop head
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q1, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v1.4s, v0.s[0]\n"
+ "ldr q2, [%x[B_ptr], #0x10]\n"
+ "ldr q3, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v2.4s, v0.s[0]\n"
+ "ldr q4, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v3.4s, v0.s[0]\n"
+ "ldr q5, [%x[B_ptr], #0x40]\n"
+ "ldr q6, [%x[B_ptr], #0x50]\n"
+ "fmla v27.4s, v4.4s, v0.s[0]\n"
+ "ldr q7, [%x[B_ptr], #0x60]\n"
+ "ldr q8, [%x[B_ptr], #0x70]\n"
+ "fmla v28.4s, v5.4s, v0.s[0]\n"
+ "fmla v29.4s, v6.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v30.4s, v7.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q9, [%x[B_ptr], #0x0]\n"
+ "fmla v31.4s, v8.4s, v0.s[0]\n"
+ "ldr q10, [%x[B_ptr], #0x10]\n"
+ "ldr q11, [%x[B_ptr], #0x20]\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "ldr q12, [%x[B_ptr], #0x30]\n"
+ "ldr q13, [%x[B_ptr], #0x40]\n"
+ "fmla v25.4s, v10.4s, v0.s[1]\n"
+ "fmla v26.4s, v11.4s, v0.s[1]\n"
+ "ldr q14, [%x[B_ptr], #0x50]\n"
+ "ldr q15, [%x[B_ptr], #0x60]\n"
+ "fmla v27.4s, v12.4s, v0.s[1]\n"
+ "ldr q16, [%x[B_ptr], #0x70]\n"
+ "fmla v28.4s, v13.4s, v0.s[1]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v29.4s, v14.4s, v0.s[1]\n"
+ "ldr q17, [%x[B_ptr], #0x0]\n"
+ "fmla v30.4s, v15.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q18, [%x[B_ptr], #0x10]\n"
+ "fmla v31.4s, v16.4s, v0.s[1]\n"
+ "ldr q19, [%x[B_ptr], #0x20]\n"
+ "ldr q20, [%x[B_ptr], #0x30]\n"
+ "fmla v24.4s, v17.4s, v0.s[2]\n"
+ "ldr q21, [%x[B_ptr], #0x40]\n"
+ "ldr q22, [%x[B_ptr], #0x50]\n"
+ "fmla v25.4s, v18.4s, v0.s[2]\n"
+ "ldr q23, [%x[B_ptr], #0x60]\n"
+ "fmla v26.4s, v19.4s, v0.s[2]\n"
+ "ldr q1, [%x[B_ptr], #0x70]\n"
+ "fmla v27.4s, v20.4s, v0.s[2]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v28.4s, v21.4s, v0.s[2]\n"
+ "ldr q2, [%x[B_ptr], #0x0]\n"
+ "fmla v29.4s, v22.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q3, [%x[B_ptr], #0x10]\n"
+ "fmla v30.4s, v23.4s, v0.s[2]\n"
+ "ldr q4, [%x[B_ptr], #0x20]\n"
+ "ldr q5, [%x[B_ptr], #0x30]\n"
+ "fmla v31.4s, v1.4s, v0.s[2]\n"
+ "ldr q6, [%x[B_ptr], #0x40]\n"
+ "fmla v24.4s, v2.4s, v0.s[3]\n"
+ "ldr q7, [%x[B_ptr], #0x50]\n"
+ "ldr q8, [%x[B_ptr], #0x60]\n"
+ "fmla v25.4s, v3.4s, v0.s[3]\n"
+ "ldr q9, [%x[B_ptr], #0x70]\n"
+ "fmla v26.4s, v4.4s, v0.s[3]\n"
+ "fmla v27.4s, v5.4s, v0.s[3]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v28.4s, v6.4s, v0.s[3]\n"
+ "add x19, x19, #0x10\n"
+ "fmla v29.4s, v7.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "sub x20, x20, #0x4\n"
+ "fmla v30.4s, v8.4s, v0.s[3]\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "cmp x20, #0x8\n"
+ "fmla v31.4s, v9.4s, v0.s[3]\n"
+ "bge 88b\n"
+ "89:" // Width 8: Multiply loop: Single iteration only
+ "sub x20, x20, #0x4\n"
+ "ldr q0, [x19, #0x0]\n"
+ "ldr q10, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v10.4s, v0.s[0]\n"
+ "ldr q11, [%x[B_ptr], #0x10]\n"
+ "ldr q12, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v11.4s, v0.s[0]\n"
+ "ldr q13, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v12.4s, v0.s[0]\n"
+ "ldr q14, [%x[B_ptr], #0x40]\n"
+ "ldr q15, [%x[B_ptr], #0x50]\n"
+ "fmla v27.4s, v13.4s, v0.s[0]\n"
+ "ldr q16, [%x[B_ptr], #0x60]\n"
+ "ldr q17, [%x[B_ptr], #0x70]\n"
+ "fmla v28.4s, v14.4s, v0.s[0]\n"
+ "fmla v29.4s, v15.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v30.4s, v16.4s, v0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q18, [%x[B_ptr], #0x0]\n"
+ "fmla v31.4s, v17.4s, v0.s[0]\n"
+ "ldr q19, [%x[B_ptr], #0x10]\n"
+ "ldr q20, [%x[B_ptr], #0x20]\n"
+ "fmla v24.4s, v18.4s, v0.s[1]\n"
+ "ldr q21, [%x[B_ptr], #0x30]\n"
+ "ldr q22, [%x[B_ptr], #0x40]\n"
+ "fmla v25.4s, v19.4s, v0.s[1]\n"
+ "fmla v26.4s, v20.4s, v0.s[1]\n"
+ "ldr q23, [%x[B_ptr], #0x50]\n"
+ "ldr q1, [%x[B_ptr], #0x60]\n"
+ "fmla v27.4s, v21.4s, v0.s[1]\n"
+ "ldr q2, [%x[B_ptr], #0x70]\n"
+ "fmla v28.4s, v22.4s, v0.s[1]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v29.4s, v23.4s, v0.s[1]\n"
+ "ldr q3, [%x[B_ptr], #0x0]\n"
+ "fmla v30.4s, v1.4s, v0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q4, [%x[B_ptr], #0x10]\n"
+ "fmla v31.4s, v2.4s, v0.s[1]\n"
+ "ldr q5, [%x[B_ptr], #0x20]\n"
+ "ldr q6, [%x[B_ptr], #0x30]\n"
+ "fmla v24.4s, v3.4s, v0.s[2]\n"
+ "ldr q7, [%x[B_ptr], #0x40]\n"
+ "ldr q8, [%x[B_ptr], #0x50]\n"
+ "fmla v25.4s, v4.4s, v0.s[2]\n"
+ "ldr q9, [%x[B_ptr], #0x60]\n"
+ "fmla v26.4s, v5.4s, v0.s[2]\n"
+ "ldr q10, [%x[B_ptr], #0x70]\n"
+ "fmla v27.4s, v6.4s, v0.s[2]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v28.4s, v7.4s, v0.s[2]\n"
+ "ldr q11, [%x[B_ptr], #0x0]\n"
+ "fmla v29.4s, v8.4s, v0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ldr q12, [%x[B_ptr], #0x10]\n"
+ "fmla v30.4s, v9.4s, v0.s[2]\n"
+ "ldr q13, [%x[B_ptr], #0x20]\n"
+ "ldr q14, [%x[B_ptr], #0x30]\n"
+ "fmla v31.4s, v10.4s, v0.s[2]\n"
+ "ldr q15, [%x[B_ptr], #0x40]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "ldr q16, [%x[B_ptr], #0x50]\n"
+ "ldr q17, [%x[B_ptr], #0x60]\n"
+ "fmla v25.4s, v12.4s, v0.s[3]\n"
+ "ldr q18, [%x[B_ptr], #0x70]\n"
+ "fmla v26.4s, v13.4s, v0.s[3]\n"
+ "fmla v27.4s, v14.4s, v0.s[3]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla v28.4s, v15.4s, v0.s[3]\n"
+ "add x19, x19, #0x10\n"
+ "fmla v29.4s, v16.4s, v0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla v30.4s, v17.4s, v0.s[3]\n"
+ "prfm pldl1keep, [x19, #0x80]\n"
+ "fmla v31.4s, v18.4s, v0.s[3]\n"
+ "90:" // Width 8: Multiply loop: Main loop skip
+ "cbz x20, 92f\n"
+ "91:" // Width 8: Multiply loop: Odd block loop
+ "ldr s0, [x19], #0x4\n"
+ "ldr q19, [%x[B_ptr], #0x0]\n"
+ "fmla v24.4s, v19.4s, v0.s[0]\n"
+ "ldr q20, [%x[B_ptr], #0x10]\n"
+ "ldr q21, [%x[B_ptr], #0x20]\n"
+ "fmla v25.4s, v20.4s, v0.s[0]\n"
+ "ldr q22, [%x[B_ptr], #0x30]\n"
+ "fmla v26.4s, v21.4s, v0.s[0]\n"
+ "ldr q23, [%x[B_ptr], #0x40]\n"
+ "ldr q1, [%x[B_ptr], #0x50]\n"
+ "fmla v27.4s, v22.4s, v0.s[0]\n"
+ "ldr q2, [%x[B_ptr], #0x60]\n"
+ "ldr q3, [%x[B_ptr], #0x70]\n"
+ "fmla v28.4s, v23.4s, v0.s[0]\n"
+ "fmla v29.4s, v1.4s, v0.s[0]\n"
+ "add %x[B_ptr], %x[B_ptr], #0x80\n"
+ "sub x20, x20, #0x1\n"
+ "fmla v30.4s, v2.4s, v0.s[0]\n"
+ "fmla v31.4s, v3.4s, v0.s[0]\n"
+ "cbnz x20, 91b\n"
+ "92:" // Width 8: Multiply loop: No odd multiplies
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 93f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v16.4s\n"
+ "fmin v30.4s, v30.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v17.4s\n"
+ "fmax v29.4s, v29.4s, v17.4s\n"
+ "fmax v30.4s, v30.4s, v17.4s\n"
+ "fmin v31.4s, v31.4s, v16.4s\n"
+ "fmax v31.4s, v31.4s, v17.4s\n"
+ "93:" // Width 8: No activation
+ "str q24, [%x[output_ptr], #0x0]\n"
+ "str q25, [%x[output_ptr], #0x10]\n"
+ "str q26, [%x[output_ptr], #0x20]\n"
+ "str q27, [%x[output_ptr], #0x30]\n"
+ "str q28, [%x[output_ptr], #0x40]\n"
+ "str q29, [%x[output_ptr], #0x50]\n"
+ "str q30, [%x[output_ptr], #0x60]\n"
+ "cmp %x[N], #0x20\n"
+ "add %x[output_ptr], %x[output_ptr], #0x70\n"
+ "blt 94f\n"
+ "str q31, [%x[output_ptr], #0x0]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x10\n"
+ "b 96f\n"
+ "94:" // Width 8: Partial writeback
+ "tbz %x[N], #1, 95f\n"
+ "str d31, [%x[output_ptr]], #0x8\n"
+ "tbz %x[N], #0, 96f\n"
+ "st1 { v31.s }[2], [%x[output_ptr]]\n"
+ "b 96f\n"
+ "95:" // Width 8: Partial direct writeback: partial_1_28
+ "tbz %x[N], #0, 96f\n"
+ "str s31, [%x[output_ptr], #0x0]\n"
+ "96:" // Width 8: Writeback done
+ "subs x22, x22, #0x8\n"
+ "sub %x[N], %x[N], #0x20\n"
+ "bgt 1b\n"
+ "97:" // Exit
+
+ : [B_ptr] "+r" (B_ptr), [N] "+r" (N), [output_ptr] "+r" (output_ptr)
+ : [A_ptr] "r" (A_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x19", "x20", "x21", "x22"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp
index 79cae6002a..73fb5b7122 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24.hpp
@@ -30,15 +30,15 @@
namespace arm_gemm {
// Actual kernel implementations
-void a64_hgemm_asimd_24x8(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
-void a64_hgemm_asimd_24x8_a55r1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
-void a64_hgemm_asimd_24x8_x1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+void a64_hgemm_asimd_8x24(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+void a64_hgemm_asimd_8x24_a55r1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+void a64_hgemm_asimd_8x24_x1(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
-// 24x8 HGEMM "strategy" class. Describes the kernel properties.
+// 8x24 HGEMM "strategy" class. Describes the kernel properties.
//
// The generic "gemm_opt" function will instantiate one of these (allowing
// the constructor to pick a kernel implementation).
-class hgemm_24x8 {
+class cls_a64_hgemm_8x24 {
public:
typedef __fp16 operand_type;
typedef __fp16 result_type;
@@ -62,15 +62,15 @@ public:
StdTransformsFixed<operand_type, result_type, 8, 24> transforms = {};
// Default to the generic kernel
- kern_type kernel = a64_hgemm_asimd_24x8;
+ kern_type kernel = a64_hgemm_asimd_8x24;
- hgemm_24x8(const CPUInfo *ci) {
+ cls_a64_hgemm_8x24(const CPUInfo *ci) {
auto model = ci->get_cpu_model();
if (model == CPUModel::A55r1) {
- kernel = a64_hgemm_asimd_24x8_a55r1;
+ kernel = a64_hgemm_asimd_8x24_a55r1;
} else if (model == CPUModel::X1) {
- kernel = a64_hgemm_asimd_24x8_x1;
+ kernel = a64_hgemm_asimd_8x24_x1;
}
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp
index 829ae30001..29cdd33893 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/a55r1.cpp
@@ -41,7 +41,7 @@
namespace arm_gemm {
-void a64_hgemm_asimd_24x8_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+void a64_hgemm_asimd_8x24_a55r1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
const __fp16 *a_ptr = Apanel;
__fp16 *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp
index 657fade944..c9c48dd1c0 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/generic.cpp
@@ -34,14 +34,14 @@
// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order.
// Assume that "Cpanel" points to a chunk of C output blocks (each size
-// 24x8), the chunks being arranged in a row major fashion.
+// 8x24), the chunks being arranged in a row major fashion.
//
// Note that the intent of this is that either ablocks or bblocks will be 1
// - this construction allows the output loop to proceed in either order.
namespace arm_gemm {
-void a64_hgemm_asimd_24x8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+void a64_hgemm_asimd_8x24(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
const __fp16 *a_ptr = Apanel;
__fp16 *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp
index 3bb8334126..a6d2405e7e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_24x8/x1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hgemm_8x24/x1.cpp
@@ -34,14 +34,14 @@
// Assume that "Apanel" points to a chunk of A blocks (each size 8xK) in read-order.
// Assume that "Bpanel" points to a chunk of B blocks (each size 24xK) in read-order.
// Assume that "Cpanel" points to a chunk of C output blocks (each size
-// 24x8), the chunks being arranged in a row major fashion.
+// 8x24), the chunks being arranged in a row major fashion.
//
// Note that the intent of this is that either ablocks or bblocks will be 1
// - this construction allows the output loop to proceed in either order.
namespace arm_gemm {
-void a64_hgemm_asimd_24x8_x1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+void a64_hgemm_asimd_8x24_x1(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
const __fp16 *a_ptr = Apanel;
__fp16 *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp
new file mode 100644
index 0000000000..a76c9949de
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16.hpp
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+#include "../bfloat.hpp"
+
+#define ARGLIST /* kernel parameter types; names below follow the definition in generic.cpp */ \
+ unsigned int, const unsigned int *, /* num_strings, string_lengths */ \
+ IndirectInputArg<bfloat16>, /* A_arg */ \
+ size_t, size_t, /* M, N */ \
+ const bfloat16 *, /* B_ptr */ \
+ IndirectOutputArg<float>, /* output_arg */ \
+ const float *, Activation, bool /* bias, act, accumulate */
+
+namespace arm_gemm
+{
+
+// Actual kernel implementation (defined in a64_hybrid_bf16fp32_dot_6x16/generic.cpp)
+void a64_hybrid_bf16fp32_dot_6x16( ARGLIST );
+
+// "Strategy" class for the AArch64 hybrid bfloat16 -> float dot-product kernel.
+// Describes the kernel properties (operand/result types, blocking parameters)
+// and holds the pointer to the kernel entry point.
+class cls_a64_hybrid_bf16fp32_dot_6x16
+{
+public:
+ typedef bfloat16 operand_type;
+ typedef float result_type;
+
+ // Function-pointer type matching the kernel prototype (same ARGLIST).
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ // Largest number of output rows the kernel handles in one pass.
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ // Number of output columns per column-loop iteration. Declared constexpr
+ // like the other blocking parameters; it returns a literal, and runtime
+ // callers are unaffected.
+ static constexpr unsigned int out_width()
+ {
+ return 16;
+ }
+
+ // Unroll factor along K; presumably 2 because each bfdot lane consumes a
+ // pair of bfloat16 values - confirm against the kernel.
+ static constexpr unsigned int k_unroll()
+ {
+ return 2;
+ }
+
+ // The kernel takes an "accumulate" flag, so accumulation is reported as supported.
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ // Transforms instantiated with the same 6 / 16 / 2 blocking values as above.
+ StdTransformsFixed<operand_type, result_type, 6, 16, 2> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_bf16fp32_dot_6x16;
+
+ // The CPUInfo argument is accepted but unused: only the generic kernel is
+ // declared for this strategy, so there is nothing to select.
+ cls_a64_hybrid_bf16fp32_dot_6x16(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
new file mode 100644
index 0000000000..be680ed645
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_bf16fp32_dot_6x16/generic.cpp
@@ -0,0 +1,3668 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_bf16fp32_dot_6x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<bfloat16> A_arg,
+ size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const bfloat16 *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 186f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 149f\n"
+ "beq 112f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 75f\n"
+ "beq 38f\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "cbz x14, 4f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "ldr q9, [x14, #0x10]\n"
+ "ldr q10, [x14, #0x20]\n"
+ "ldr q11, [x14, #0x30]\n"
+ "add x14, x14, #0x40\n"
+ "b 15f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 14f\n"
+ "cmp x16, #0x10\n"
+ "bge 13f\n"
+ "tbz x16, #3, 8f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "tbz x16, #2, 6f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "tbz x16, #1, 5f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "tbz x16, #0, 12f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "b 12f\n"
+ "5:" // Height 1: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 12f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "b 12f\n"
+ "6:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x16, #1, 7f\n"
+ "ldr d10, [x13], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 12f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "b 12f\n"
+ "7:" // Height 1: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 12f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "b 12f\n"
+ "8:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x16, #2, 10f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "tbz x16, #1, 9f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "tbz x16, #0, 12f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "b 12f\n"
+ "9:" // Height 1: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 12f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "b 12f\n"
+ "10:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x16, #1, 11f\n"
+ "ldr d8, [x13], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 12f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "b 12f\n"
+ "11:" // Height 1: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "12:" // Height 1: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "b 15f\n"
+ "13:" // Height 1: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "b 15f\n"
+ "14:" // Height 1: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "15:" // Height 1: setup done
+ "mov x12, #0x0\n"
+ "16:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 17f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "cbnz x12, 18f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "b 18f\n"
+ "17:" // Height 1: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "18:" // Height 1: input setup done
+ "cmp x11, #0x8\n"
+ "blt 21f\n"
+ "cmp x11, #0x10\n"
+ "blt 20f\n"
+ "19:" // Height 1: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q7, [x15, #0x50]\n"
+ ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "sub x11, x11, #0x8\n"
+ ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "cmp x11, #0x10\n"
+ ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ "bge 19b\n"
+ "20:" // Height 1: Multiply loop: Single iteration only
+ "sub x11, x11, #0x8\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q7, [x15, #0x50]\n"
+ ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ "21:" // Height 1: Multiply loop: Main loop skip
+ "cbz x11, 26f\n"
+ "cmp x11, #0x2\n"
+ "blt 23f\n"
+ "22:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "sub x11, x11, #0x2\n"
+ "add x15, x15, #0x40\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ "cmp x11, #0x2\n"
+ "bge 22b\n"
+ "cbz x11, 26f\n"
+ "23:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 24f\n"
+ "ldr s0, [x10], #0x4\n"
+ "tbz x11, #0, 25f\n"
+ "ld1 { v0.h }[2], [x10]\n"
+ "b 25f\n"
+ "24:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h0, [x10, #0x0]\n"
+ "25:" // Height 1: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ "26:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 16b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "tbz %x[flags], #1, 27f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "27:" // Height 1: No activation
+ "cmp x16, #0x10\n"
+ "bge 36f\n"
+ "tbz x16, #3, 31f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "tbz x16, #2, 29f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "tbz x16, #1, 28f\n"
+ "str d11, [x13], #0x8\n"
+ "tbz x16, #0, 35f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "b 35f\n"
+ "28:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 35f\n"
+ "str s11, [x13, #0x0]\n"
+ "b 35f\n"
+ "29:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 30f\n"
+ "str d10, [x13], #0x8\n"
+ "tbz x16, #0, 35f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "b 35f\n"
+ "30:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 35f\n"
+ "str s10, [x13, #0x0]\n"
+ "b 35f\n"
+ "31:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 33f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "tbz x16, #1, 32f\n"
+ "str d9, [x13], #0x8\n"
+ "tbz x16, #0, 35f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "b 35f\n"
+ "32:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 35f\n"
+ "str s9, [x13, #0x0]\n"
+ "b 35f\n"
+ "33:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 34f\n"
+ "str d8, [x13], #0x8\n"
+ "tbz x16, #0, 35f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "b 35f\n"
+ "34:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "35:" // Height 1: Partial direct writeback: Done
+ "b 37f\n"
+ "36:" // Height 1: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "37:" // Height 1: Writeback done
+ "subs x16, x16, #0x10\n"
+ "bgt 3b\n"
+ "b 224f\n"
+ "38:" // Height 2
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 39f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "b 40f\n"
+ "39:" // Height 2: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "40:" // Height 2: Column loop
+ "cbz x14, 41f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v13.16b, v9.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v14.16b, v10.16b\n"
+ "add x14, x14, #0x40\n"
+ "mov v15.16b, v11.16b\n"
+ "b 52f\n"
+ "41:" // Height 2: no bias
+ "tbz %x[flags], #0, 51f\n"
+ "cmp x16, #0x10\n"
+ "bge 50f\n"
+ "tbz x16, #3, 45f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "tbz x16, #2, 43f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "tbz x16, #1, 42f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "tbz x16, #0, 49f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "b 49f\n"
+ "42:" // Height 2: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 49f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "b 49f\n"
+ "43:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x16, #1, 44f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 49f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "b 49f\n"
+ "44:" // Height 2: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 49f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "b 49f\n"
+ "45:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x16, #2, 47f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "tbz x16, #1, 46f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "tbz x16, #0, 49f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "b 49f\n"
+ "46:" // Height 2: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 49f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "b 49f\n"
+ "47:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x16, #1, 48f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 49f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "b 49f\n"
+ "48:" // Height 2: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "49:" // Height 2: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "b 52f\n"
+ "50:" // Height 2: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "b 52f\n"
+ "51:" // Height 2: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "52:" // Height 2: setup done
+ "mov x12, #0x0\n"
+ "53:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 54f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x12, 55f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "b 55f\n"
+ "54:" // Height 2: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "55:" // Height 2: input setup done
+ "cmp x11, #0x8\n"
+ "blt 58f\n"
+ "cmp x11, #0x10\n"
+ "blt 57f\n"
+ "56:" // Height 2: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "sub x11, x11, #0x8\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "cmp x11, #0x10\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ "bge 56b\n"
+ "57:" // Height 2: Multiply loop: Single iteration only
+ "sub x11, x11, #0x8\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ "58:" // Height 2: Multiply loop: Main loop skip
+ "cbz x11, 63f\n"
+ "cmp x11, #0x2\n"
+ "blt 60f\n"
+ "59:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "sub x11, x11, #0x2\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "cmp x11, #0x2\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ "bge 59b\n"
+ "cbz x11, 63f\n"
+ "60:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 61f\n"
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "tbz x11, #0, 62f\n"
+ "ld1 { v0.h }[2], [x10]\n"
+ "ld1 { v1.h }[2], [x28]\n"
+ "b 62f\n"
+ "61:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h0, [x10, #0x0]\n"
+ "ldr h1, [x28, #0x0]\n"
+ "62:" // Height 2: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ "63:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 53b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "tbz %x[flags], #1, 64f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "64:" // Height 2: No activation
+ "cmp x16, #0x10\n"
+ "bge 73f\n"
+ "tbz x16, #3, 68f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "tbz x16, #2, 66f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "tbz x16, #1, 65f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "tbz x16, #0, 72f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "b 72f\n"
+ "65:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 72f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "b 72f\n"
+ "66:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 67f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "tbz x16, #0, 72f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "b 72f\n"
+ "67:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 72f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "b 72f\n"
+ "68:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 70f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "tbz x16, #1, 69f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "tbz x16, #0, 72f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "b 72f\n"
+ "69:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 72f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "b 72f\n"
+ "70:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 71f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "tbz x16, #0, 72f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "b 72f\n"
+ "71:" // Height 2: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "72:" // Height 2: Partial direct writeback: Done
+ "b 74f\n"
+ "73:" // Height 2: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "74:" // Height 2: Writeback done
+ "subs x16, x16, #0x10\n"
+ "bgt 40b\n"
+ "b 224f\n"
+ "75:" // Height 3
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 76f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "b 77f\n"
+ "76:" // Height 3: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "77:" // Height 3: Column loop
+ "cbz x14, 78f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q10, [x14, #0x20]\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v13.16b, v9.16b\n"
+ "add x14, x14, #0x40\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "b 89f\n"
+ "78:" // Height 3: no bias
+ "tbz %x[flags], #0, 88f\n"
+ "cmp x16, #0x10\n"
+ "bge 87f\n"
+ "tbz x16, #3, 82f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "tbz x16, #2, 80f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "tbz x16, #1, 79f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "tbz x16, #0, 86f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "b 86f\n"
+ "79:" // Height 3: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 86f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "b 86f\n"
+ "80:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x16, #1, 81f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 86f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "b 86f\n"
+ "81:" // Height 3: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 86f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "b 86f\n"
+ "82:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x16, #2, 84f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "tbz x16, #1, 83f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "tbz x16, #0, 86f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "b 86f\n"
+ "83:" // Height 3: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 86f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "b 86f\n"
+ "84:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x16, #1, 85f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 86f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "b 86f\n"
+ "85:" // Height 3: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "86:" // Height 3: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "b 89f\n"
+ "87:" // Height 3: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "b 89f\n"
+ "88:" // Height 3: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "89:" // Height 3: setup done
+ "mov x12, #0x0\n"
+ "90:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 91f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x12, 92f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "b 92f\n"
+ "91:" // Height 3: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "92:" // Height 3: input setup done
+ "cmp x11, #0x8\n"
+ "blt 95f\n"
+ "cmp x11, #0x10\n"
+ "blt 94f\n"
+ "93:" // Height 3: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "sub x11, x11, #0x8\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ "cmp x11, #0x10\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
+ "bge 93b\n"
+ "94:" // Height 3: Multiply loop: Single iteration only
+ "sub x11, x11, #0x8\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
+ "95:" // Height 3: Multiply loop: Main loop skip
+ "cbz x11, 100f\n"
+ "cmp x11, #0x2\n"
+ "blt 97f\n"
+ "96:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "sub x11, x11, #0x2\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "cmp x11, #0x2\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ "bge 96b\n"
+ "cbz x11, 100f\n"
+ "97:" // Height 3: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 98f\n"
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "tbz x11, #0, 99f\n"
+ "ld1 { v0.h }[2], [x10]\n"
+ "ld1 { v1.h }[2], [x28]\n"
+ "ld1 { v2.h }[2], [x26]\n"
+ "b 99f\n"
+ "98:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h0, [x10, #0x0]\n"
+ "ldr h1, [x28, #0x0]\n"
+ "ldr h2, [x26, #0x0]\n"
+ "99:" // Height 3: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ "100:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 90b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "tbz %x[flags], #1, 101f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "101:" // Height 3: No activation
+ "cmp x16, #0x10\n"
+ "bge 110f\n"
+ "tbz x16, #3, 105f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "tbz x16, #2, 103f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "tbz x16, #1, 102f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "tbz x16, #0, 109f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "b 109f\n"
+ "102:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 109f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "b 109f\n"
+ "103:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 104f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "tbz x16, #0, 109f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "b 109f\n"
+ "104:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 109f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "b 109f\n"
+ "105:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 107f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "tbz x16, #1, 106f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "tbz x16, #0, 109f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "b 109f\n"
+ "106:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 109f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "b 109f\n"
+ "107:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 108f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "tbz x16, #0, 109f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "b 109f\n"
+ "108:" // Height 3: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "109:" // Height 3: Partial direct writeback: Done
+ "b 111f\n"
+ "110:" // Height 3: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "111:" // Height 3: Writeback done
+ "subs x16, x16, #0x10\n"
+ "bgt 77b\n"
+ "b 224f\n"
+ "112:" // Height 4
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 113f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 114f\n"
+ "113:" // Height 4: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "114:" // Height 4: Column loop
+ "cbz x14, 115f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v20.16b, v8.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "add x14, x14, #0x40\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "b 126f\n"
+ "115:" // Height 4: no bias
+ "tbz %x[flags], #0, 125f\n"
+ "cmp x16, #0x10\n"
+ "bge 124f\n"
+ "tbz x16, #3, 119f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "ld1 { v21.4s }, [x25], #0x10\n"
+ "tbz x16, #2, 117f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "ld1 { v22.4s }, [x25], #0x10\n"
+ "tbz x16, #1, 116f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "tbz x16, #0, 123f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "ld1 { v23.s }[2], [x25]\n"
+ "b 123f\n"
+ "116:" // Height 4: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 123f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "ldr s23, [x25, #0x0]\n"
+ "b 123f\n"
+ "117:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x16, #1, 118f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 123f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "b 123f\n"
+ "118:" // Height 4: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 123f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "b 123f\n"
+ "119:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x16, #2, 121f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "tbz x16, #1, 120f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "tbz x16, #0, 123f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "ld1 { v21.s }[2], [x25]\n"
+ "b 123f\n"
+ "120:" // Height 4: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 123f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "ldr s21, [x25, #0x0]\n"
+ "b 123f\n"
+ "121:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x16, #1, 122f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 123f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "ld1 { v20.s }[2], [x25]\n"
+ "b 123f\n"
+ "122:" // Height 4: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "ldr s20, [x25, #0x0]\n"
+ "123:" // Height 4: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "b 126f\n"
+ "124:" // Height 4: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "b 126f\n"
+ "125:" // Height 4: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "126:" // Height 4: setup done
+ "mov x12, #0x0\n"
+ "127:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 128f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x12, 129f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "b 129f\n"
+ "128:" // Height 4: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "129:" // Height 4: input setup done
+ "cmp x11, #0x8\n"
+ "blt 132f\n"
+ "cmp x11, #0x10\n"
+ "blt 131f\n"
+ "130:" // Height 4: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sub x11, x11, #0x8\n"
+ ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "cmp x11, #0x10\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
+ ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
+ ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
+ ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
+ ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
+ ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
+ ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
+ ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
+ ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n"
+ ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
+ "bge 130b\n"
+ "131:" // Height 4: Multiply loop: Single iteration only
+ "sub x11, x11, #0x8\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
+ ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
+ ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
+ ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
+ ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
+ ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
+ ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
+ ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
+ ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n"
+ ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
+ "132:" // Height 4: Multiply loop: Main loop skip
+ "cbz x11, 137f\n"
+ "cmp x11, #0x2\n"
+ "blt 134f\n"
+ "133:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "sub x11, x11, #0x2\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "cmp x11, #0x2\n"
+ ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ "bge 133b\n"
+ "cbz x11, 137f\n"
+ "134:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 135f\n"
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "tbz x11, #0, 136f\n"
+ "ld1 { v0.h }[2], [x10]\n"
+ "ld1 { v1.h }[2], [x28]\n"
+ "ld1 { v2.h }[2], [x26]\n"
+ "ld1 { v3.h }[2], [x24]\n"
+ "b 136f\n"
+ "135:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h0, [x10, #0x0]\n"
+ "ldr h1, [x28, #0x0]\n"
+ "ldr h2, [x26, #0x0]\n"
+ "ldr h3, [x24, #0x0]\n"
+ "136:" // Height 4: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ "137:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 127b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbz %x[flags], #1, 138f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "138:" // Height 4: No activation
+ "cmp x16, #0x10\n"
+ "bge 147f\n"
+ "tbz x16, #3, 142f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "tbz x16, #2, 140f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "st1 { v22.4s }, [x25], #0x10\n"
+ "tbz x16, #1, 139f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "tbz x16, #0, 146f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "st1 { v23.s }[2], [x25]\n"
+ "b 146f\n"
+ "139:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 146f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "str s23, [x25, #0x0]\n"
+ "b 146f\n"
+ "140:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 141f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "tbz x16, #0, 146f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "b 146f\n"
+ "141:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 146f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "b 146f\n"
+ "142:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 144f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "tbz x16, #1, 143f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "tbz x16, #0, 146f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "b 146f\n"
+ "143:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 146f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "b 146f\n"
+ "144:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 145f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "tbz x16, #0, 146f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "b 146f\n"
+ "145:" // Height 4: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "146:" // Height 4: Partial direct writeback: Done
+ "b 148f\n"
+ "147:" // Height 4: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "148:" // Height 4: Writeback done
+ "subs x16, x16, #0x10\n"
+ "bgt 114b\n"
+ "b 224f\n"
+ "149:" // Height 5
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 150f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 151f\n"
+ "150:" // Height 5: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "151:" // Height 5: Column loop
+ "cbz x14, 152f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v20.16b, v8.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v24.16b, v8.16b\n"
+ "add x14, x14, #0x40\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "b 163f\n"
+ "152:" // Height 5: no bias
+ "tbz %x[flags], #0, 162f\n"
+ "cmp x16, #0x10\n"
+ "bge 161f\n"
+ "tbz x16, #3, 156f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "ld1 { v21.4s }, [x25], #0x10\n"
+ "ld1 { v25.4s }, [x23], #0x10\n"
+ "tbz x16, #2, 154f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "ld1 { v22.4s }, [x25], #0x10\n"
+ "ld1 { v26.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 153f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "tbz x16, #0, 160f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "ld1 { v23.s }[2], [x25]\n"
+ "ld1 { v27.s }[2], [x23]\n"
+ "b 160f\n"
+ "153:" // Height 5: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 160f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "ldr s23, [x25, #0x0]\n"
+ "ldr s27, [x23, #0x0]\n"
+ "b 160f\n"
+ "154:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x16, #1, 155f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 160f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "ld1 { v26.s }[2], [x23]\n"
+ "b 160f\n"
+ "155:" // Height 5: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 160f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "ldr s26, [x23, #0x0]\n"
+ "b 160f\n"
+ "156:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x16, #2, 158f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 157f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "tbz x16, #0, 160f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "ld1 { v21.s }[2], [x25]\n"
+ "ld1 { v25.s }[2], [x23]\n"
+ "b 160f\n"
+ "157:" // Height 5: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 160f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "ldr s21, [x25, #0x0]\n"
+ "ldr s25, [x23, #0x0]\n"
+ "b 160f\n"
+ "158:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x16, #1, 159f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 160f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "ld1 { v20.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "b 160f\n"
+ "159:" // Height 5: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "ldr s20, [x25, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "160:" // Height 5: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "sub x23, x23, x19\n"
+ "b 163f\n"
+ "161:" // Height 5: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "ldr q24, [x23, #0x0]\n"
+ "ldr q25, [x23, #0x10]\n"
+ "ldr q26, [x23, #0x20]\n"
+ "ldr q27, [x23, #0x30]\n"
+ "b 163f\n"
+ "162:" // Height 5: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "163:" // Height 5: setup done
+ "mov x12, #0x0\n"
+ "164:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 165f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x12, 166f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "b 166f\n"
+ "165:" // Height 5: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "add x22, x24, x19, LSL #1\n"
+ "166:" // Height 5: input setup done
+ "cmp x11, #0x8\n"
+ "blt 169f\n"
+ "cmp x11, #0x10\n"
+ "blt 168f\n"
+ "167:" // Height 5: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sub x11, x11, #0x8\n"
+ ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ "cmp x11, #0x10\n"
+ ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
+ ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
+ ".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
+ ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
+ ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
+ ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
+ ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
+ ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
+ ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
+ ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
+ ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
+ ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
+ ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
+ ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
+ ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
+ ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
+ ".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
+ ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
+ ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
+ "add x15, x15, #0x100\n"
+ ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n"
+ ".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n"
+ ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
+ ".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n"
+ "bge 167b\n"
+ "168:" // Height 5: Multiply loop: Single iteration only
+ "sub x11, x11, #0x8\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
+ ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
+ ".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
+ ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
+ ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
+ ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
+ ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
+ ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
+ ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
+ ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
+ ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
+ ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
+ ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
+ ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
+ ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
+ ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
+ ".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
+ ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
+ ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
+ "add x15, x15, #0x100\n"
+ ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n"
+ ".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n"
+ ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
+ ".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n"
+ "169:" // Height 5: Multiply loop: Main loop skip
+ "cbz x11, 174f\n"
+ "cmp x11, #0x2\n"
+ "blt 171f\n"
+ "170:" // Height 5: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "sub x11, x11, #0x2\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "cmp x11, #0x2\n"
+ ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
+ "bge 170b\n"
+ "cbz x11, 174f\n"
+ "171:" // Height 5: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 172f\n"
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "tbz x11, #0, 173f\n"
+ "ld1 { v0.h }[2], [x10]\n"
+ "ld1 { v1.h }[2], [x28]\n"
+ "ld1 { v2.h }[2], [x26]\n"
+ "ld1 { v3.h }[2], [x24]\n"
+ "ld1 { v4.h }[2], [x22]\n"
+ "b 173f\n"
+ "172:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h0, [x10, #0x0]\n"
+ "ldr h1, [x28, #0x0]\n"
+ "ldr h2, [x26, #0x0]\n"
+ "ldr h3, [x24, #0x0]\n"
+ "ldr h4, [x22, #0x0]\n"
+ "173:" // Height 5: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
+ "174:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 164b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 175f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v0.4s\n"
+ "fmin v25.4s, v25.4s, v0.4s\n"
+ "fmin v26.4s, v26.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v1.4s\n"
+ "fmax v25.4s, v25.4s, v1.4s\n"
+ "fmax v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v1.4s\n"
+ "175:" // Height 5: No activation
+ "cmp x16, #0x10\n"
+ "bge 184f\n"
+ "tbz x16, #3, 179f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v25.4s }, [x23], #0x10\n"
+ "tbz x16, #2, 177f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "st1 { v22.4s }, [x25], #0x10\n"
+ "st1 { v26.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 176f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "str d27, [x23], #0x8\n"
+ "tbz x16, #0, 183f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "st1 { v23.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x23]\n"
+ "b 183f\n"
+ "176:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 183f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "str s23, [x25, #0x0]\n"
+ "str s27, [x23, #0x0]\n"
+ "b 183f\n"
+ "177:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 178f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "str d26, [x23], #0x8\n"
+ "tbz x16, #0, 183f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "st1 { v26.s }[2], [x23]\n"
+ "b 183f\n"
+ "178:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 183f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "str s26, [x23, #0x0]\n"
+ "b 183f\n"
+ "179:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 181f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 180f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "str d25, [x23], #0x8\n"
+ "tbz x16, #0, 183f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "st1 { v25.s }[2], [x23]\n"
+ "b 183f\n"
+ "180:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 183f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "str s25, [x23, #0x0]\n"
+ "b 183f\n"
+ "181:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 182f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "tbz x16, #0, 183f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "b 183f\n"
+ "182:" // Height 5: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "str s24, [x23, #0x0]\n"
+ "183:" // Height 5: Partial direct writeback: Done
+ "b 185f\n"
+ "184:" // Height 5: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q25, [x23, #0x10]\n"
+ "str q26, [x23, #0x20]\n"
+ "str q27, [x23, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "add x23, x23, #0x40\n"
+ "185:" // Height 5: Writeback done
+ "subs x16, x16, #0x10\n"
+ "bgt 151b\n"
+ "b 224f\n"
+ "186:" // Height 6
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 187f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "ldr x21, [%x[output_ptr], #0x28]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 188f\n"
+ "187:" // Height 6: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "add x21, x23, x19, LSL #2\n"
+ "add %x[output_ptr], x21, x19, LSL #2\n"
+ "188:" // Height 6: Column loop
+ "cbz x14, 189f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v20.16b, v8.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v24.16b, v8.16b\n"
+ "add x14, x14, #0x40\n"
+ "mov v28.16b, v8.16b\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "mov v29.16b, v9.16b\n"
+ "mov v30.16b, v10.16b\n"
+ "mov v31.16b, v11.16b\n"
+ "b 200f\n"
+ "189:" // Height 6: no bias
+ "tbz %x[flags], #0, 199f\n"
+ "cmp x16, #0x10\n"
+ "bge 198f\n"
+ "tbz x16, #3, 193f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "ld1 { v21.4s }, [x25], #0x10\n"
+ "ld1 { v25.4s }, [x23], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
+ "tbz x16, #2, 191f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "ld1 { v22.4s }, [x25], #0x10\n"
+ "ld1 { v26.4s }, [x23], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 190f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "tbz x16, #0, 197f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "ld1 { v23.s }[2], [x25]\n"
+ "ld1 { v27.s }[2], [x23]\n"
+ "ld1 { v31.s }[2], [x21]\n"
+ "b 197f\n"
+ "190:" // Height 6: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 197f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "ldr s23, [x25, #0x0]\n"
+ "ldr s27, [x23, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
+ "b 197f\n"
+ "191:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x16, #1, 192f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 197f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "ld1 { v26.s }[2], [x23]\n"
+ "ld1 { v30.s }[2], [x21]\n"
+ "b 197f\n"
+ "192:" // Height 6: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 197f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "ldr s26, [x23, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
+ "b 197f\n"
+ "193:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x16, #2, 195f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 194f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
+ "tbz x16, #0, 197f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "ld1 { v21.s }[2], [x25]\n"
+ "ld1 { v25.s }[2], [x23]\n"
+ "ld1 { v29.s }[2], [x21]\n"
+ "b 197f\n"
+ "194:" // Height 6: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 197f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "ldr s21, [x25, #0x0]\n"
+ "ldr s25, [x23, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
+ "b 197f\n"
+ "195:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x16, #1, 196f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 197f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "ld1 { v20.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v28.s }[2], [x21]\n"
+ "b 197f\n"
+ "196:" // Height 6: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "ldr s20, [x25, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
+ "197:" // Height 6: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "sub x23, x23, x19\n"
+ "sub x21, x21, x19\n"
+ "b 200f\n"
+ "198:" // Height 6: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "ldr q24, [x23, #0x0]\n"
+ "ldr q25, [x23, #0x10]\n"
+ "ldr q26, [x23, #0x20]\n"
+ "ldr q27, [x23, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
+ "b 200f\n"
+ "199:" // Height 6: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "200:" // Height 6: setup done
+ "mov x12, #0x0\n"
+ "201:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 202f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x12, 203f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "add x20, x20, x19, LSL #1\n"
+ "b 203f\n"
+ "202:" // Height 6: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "add x22, x24, x19, LSL #1\n"
+ "add x20, x22, x19, LSL #1\n"
+ "203:" // Height 6: input setup done
+ "cmp x11, #0x8\n"
+ "blt 206f\n"
+ "cmp x11, #0x10\n"
+ "blt 205f\n"
+ "204:" // Height 6: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "sub x11, x11, #0x8\n"
+ ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ "cmp x11, #0x10\n"
+ ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
+ ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
+ ".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
+ ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
+ ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
+ ".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n"
+ ".inst 0x4f65f0dc // bfdot v28.4s, v6.8h, v5.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
+ ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
+ ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n"
+ ".inst 0x4f65f0fd // bfdot v29.4s, v7.8h, v5.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
+ ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
+ ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n"
+ ".inst 0x4f65f0de // bfdot v30.4s, v6.8h, v5.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
+ ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
+ ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n"
+ ".inst 0x4f65f0ff // bfdot v31.4s, v7.8h, v5.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
+ ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
+ ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n"
+ ".inst 0x4f45f8dc // bfdot v28.4s, v6.8h, v5.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
+ ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
+ ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n"
+ ".inst 0x4f45f8fd // bfdot v29.4s, v7.8h, v5.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
+ ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
+ ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n"
+ ".inst 0x4f45f8de // bfdot v30.4s, v6.8h, v5.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
+ ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
+ ".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n"
+ ".inst 0x4f45f8ff // bfdot v31.4s, v7.8h, v5.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
+ ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n"
+ ".inst 0x4f65f8dc // bfdot v28.4s, v6.8h, v5.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
+ ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n"
+ ".inst 0x4f65f8fd // bfdot v29.4s, v7.8h, v5.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
+ "add x15, x15, #0x100\n"
+ ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n"
+ ".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n"
+ ".inst 0x4f65f8de // bfdot v30.4s, v6.8h, v5.h[3]\n"
+ ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
+ ".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n"
+ ".inst 0x4f65f8ff // bfdot v31.4s, v7.8h, v5.h[3]\n"
+ "bge 204b\n"
+ "205:" // Height 6: Multiply loop: Single iteration only
+ "sub x11, x11, #0x8\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
+ ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
+ ".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
+ ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ ".inst 0x4f60f0c8 // bfdot v8.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0cc // bfdot v12.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d0 // bfdot v16.4s, v6.8h, v2.h[1]\n"
+ ".inst 0x4f63f0d4 // bfdot v20.4s, v6.8h, v3.h[1]\n"
+ ".inst 0x4f64f0d8 // bfdot v24.4s, v6.8h, v4.h[1]\n"
+ ".inst 0x4f65f0dc // bfdot v28.4s, v6.8h, v5.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ ".inst 0x4f60f0e9 // bfdot v9.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ed // bfdot v13.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f1 // bfdot v17.4s, v7.8h, v2.h[1]\n"
+ ".inst 0x4f63f0f5 // bfdot v21.4s, v7.8h, v3.h[1]\n"
+ ".inst 0x4f64f0f9 // bfdot v25.4s, v7.8h, v4.h[1]\n"
+ ".inst 0x4f65f0fd // bfdot v29.4s, v7.8h, v5.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ ".inst 0x4f60f0ca // bfdot v10.4s, v6.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ce // bfdot v14.4s, v6.8h, v1.h[1]\n"
+ ".inst 0x4f62f0d2 // bfdot v18.4s, v6.8h, v2.h[1]\n"
+ ".inst 0x4f63f0d6 // bfdot v22.4s, v6.8h, v3.h[1]\n"
+ ".inst 0x4f64f0da // bfdot v26.4s, v6.8h, v4.h[1]\n"
+ ".inst 0x4f65f0de // bfdot v30.4s, v6.8h, v5.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ ".inst 0x4f60f0eb // bfdot v11.4s, v7.8h, v0.h[1]\n"
+ ".inst 0x4f61f0ef // bfdot v15.4s, v7.8h, v1.h[1]\n"
+ ".inst 0x4f62f0f3 // bfdot v19.4s, v7.8h, v2.h[1]\n"
+ ".inst 0x4f63f0f7 // bfdot v23.4s, v7.8h, v3.h[1]\n"
+ ".inst 0x4f64f0fb // bfdot v27.4s, v7.8h, v4.h[1]\n"
+ ".inst 0x4f65f0ff // bfdot v31.4s, v7.8h, v5.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ ".inst 0x4f40f8c8 // bfdot v8.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8cc // bfdot v12.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d0 // bfdot v16.4s, v6.8h, v2.h[2]\n"
+ ".inst 0x4f43f8d4 // bfdot v20.4s, v6.8h, v3.h[2]\n"
+ ".inst 0x4f44f8d8 // bfdot v24.4s, v6.8h, v4.h[2]\n"
+ ".inst 0x4f45f8dc // bfdot v28.4s, v6.8h, v5.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ ".inst 0x4f40f8e9 // bfdot v9.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ed // bfdot v13.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f1 // bfdot v17.4s, v7.8h, v2.h[2]\n"
+ ".inst 0x4f43f8f5 // bfdot v21.4s, v7.8h, v3.h[2]\n"
+ ".inst 0x4f44f8f9 // bfdot v25.4s, v7.8h, v4.h[2]\n"
+ ".inst 0x4f45f8fd // bfdot v29.4s, v7.8h, v5.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ ".inst 0x4f40f8ca // bfdot v10.4s, v6.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ce // bfdot v14.4s, v6.8h, v1.h[2]\n"
+ ".inst 0x4f42f8d2 // bfdot v18.4s, v6.8h, v2.h[2]\n"
+ ".inst 0x4f43f8d6 // bfdot v22.4s, v6.8h, v3.h[2]\n"
+ ".inst 0x4f44f8da // bfdot v26.4s, v6.8h, v4.h[2]\n"
+ ".inst 0x4f45f8de // bfdot v30.4s, v6.8h, v5.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ ".inst 0x4f40f8eb // bfdot v11.4s, v7.8h, v0.h[2]\n"
+ ".inst 0x4f41f8ef // bfdot v15.4s, v7.8h, v1.h[2]\n"
+ ".inst 0x4f42f8f3 // bfdot v19.4s, v7.8h, v2.h[2]\n"
+ ".inst 0x4f43f8f7 // bfdot v23.4s, v7.8h, v3.h[2]\n"
+ ".inst 0x4f44f8fb // bfdot v27.4s, v7.8h, v4.h[2]\n"
+ ".inst 0x4f45f8ff // bfdot v31.4s, v7.8h, v5.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ ".inst 0x4f60f8c8 // bfdot v8.4s, v6.8h, v0.h[3]\n"
+ ".inst 0x4f61f8cc // bfdot v12.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d0 // bfdot v16.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f63f8d4 // bfdot v20.4s, v6.8h, v3.h[3]\n"
+ ".inst 0x4f64f8d8 // bfdot v24.4s, v6.8h, v4.h[3]\n"
+ ".inst 0x4f65f8dc // bfdot v28.4s, v6.8h, v5.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ ".inst 0x4f60f8e9 // bfdot v9.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ed // bfdot v13.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f1 // bfdot v17.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f63f8f5 // bfdot v21.4s, v7.8h, v3.h[3]\n"
+ ".inst 0x4f64f8f9 // bfdot v25.4s, v7.8h, v4.h[3]\n"
+ ".inst 0x4f65f8fd // bfdot v29.4s, v7.8h, v5.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ ".inst 0x4f60f8ca // bfdot v10.4s, v6.8h, v0.h[3]\n"
+ "add x15, x15, #0x100\n"
+ ".inst 0x4f61f8ce // bfdot v14.4s, v6.8h, v1.h[3]\n"
+ ".inst 0x4f62f8d2 // bfdot v18.4s, v6.8h, v2.h[3]\n"
+ ".inst 0x4f63f8d6 // bfdot v22.4s, v6.8h, v3.h[3]\n"
+ ".inst 0x4f64f8da // bfdot v26.4s, v6.8h, v4.h[3]\n"
+ ".inst 0x4f65f8de // bfdot v30.4s, v6.8h, v5.h[3]\n"
+ ".inst 0x4f60f8eb // bfdot v11.4s, v7.8h, v0.h[3]\n"
+ ".inst 0x4f61f8ef // bfdot v15.4s, v7.8h, v1.h[3]\n"
+ ".inst 0x4f62f8f3 // bfdot v19.4s, v7.8h, v2.h[3]\n"
+ ".inst 0x4f63f8f7 // bfdot v23.4s, v7.8h, v3.h[3]\n"
+ ".inst 0x4f64f8fb // bfdot v27.4s, v7.8h, v4.h[3]\n"
+ ".inst 0x4f65f8ff // bfdot v31.4s, v7.8h, v5.h[3]\n"
+ "206:" // Height 6: Multiply loop: Main loop skip
+ "cbz x11, 211f\n"
+ "cmp x11, #0x2\n"
+ "blt 208f\n"
+ "207:" // Height 6: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s5, [x20], #0x4\n"
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ "sub x11, x11, #0x2\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ "cmp x11, #0x2\n"
+ ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
+ ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
+ ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
+ ".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
+ ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n"
+ "bge 207b\n"
+ "cbz x11, 211f\n"
+ "208:" // Height 6: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 209f\n"
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s5, [x20], #0x4\n"
+ "tbz x11, #0, 210f\n"
+ "ld1 { v0.h }[2], [x10]\n"
+ "ld1 { v1.h }[2], [x28]\n"
+ "ld1 { v2.h }[2], [x26]\n"
+ "ld1 { v3.h }[2], [x24]\n"
+ "ld1 { v4.h }[2], [x22]\n"
+ "ld1 { v5.h }[2], [x20]\n"
+ "b 210f\n"
+ "209:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr h0, [x10, #0x0]\n"
+ "ldr h1, [x28, #0x0]\n"
+ "ldr h2, [x26, #0x0]\n"
+ "ldr h3, [x24, #0x0]\n"
+ "ldr h4, [x22, #0x0]\n"
+ "ldr h5, [x20, #0x0]\n"
+ "210:" // Height 6: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x15, #0x0]\n"
+ ".inst 0x4f40f0c8 // bfdot v8.4s, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ ".inst 0x4f41f0cc // bfdot v12.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d0 // bfdot v16.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d4 // bfdot v20.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0d8 // bfdot v24.4s, v6.8h, v4.h[0]\n"
+ ".inst 0x4f45f0dc // bfdot v28.4s, v6.8h, v5.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ ".inst 0x4f40f0e9 // bfdot v9.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ed // bfdot v13.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f1 // bfdot v17.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f5 // bfdot v21.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0f9 // bfdot v25.4s, v7.8h, v4.h[0]\n"
+ ".inst 0x4f45f0fd // bfdot v29.4s, v7.8h, v5.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ ".inst 0x4f40f0ca // bfdot v10.4s, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ ".inst 0x4f41f0ce // bfdot v14.4s, v6.8h, v1.h[0]\n"
+ ".inst 0x4f42f0d2 // bfdot v18.4s, v6.8h, v2.h[0]\n"
+ ".inst 0x4f43f0d6 // bfdot v22.4s, v6.8h, v3.h[0]\n"
+ ".inst 0x4f44f0da // bfdot v26.4s, v6.8h, v4.h[0]\n"
+ ".inst 0x4f45f0de // bfdot v30.4s, v6.8h, v5.h[0]\n"
+ ".inst 0x4f40f0eb // bfdot v11.4s, v7.8h, v0.h[0]\n"
+ ".inst 0x4f41f0ef // bfdot v15.4s, v7.8h, v1.h[0]\n"
+ ".inst 0x4f42f0f3 // bfdot v19.4s, v7.8h, v2.h[0]\n"
+ ".inst 0x4f43f0f7 // bfdot v23.4s, v7.8h, v3.h[0]\n"
+ ".inst 0x4f44f0fb // bfdot v27.4s, v7.8h, v4.h[0]\n"
+ ".inst 0x4f45f0ff // bfdot v31.4s, v7.8h, v5.h[0]\n"
+ "211:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 201b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 212f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v0.4s\n"
+ "fmin v25.4s, v25.4s, v0.4s\n"
+ "fmin v26.4s, v26.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v1.4s\n"
+ "fmax v25.4s, v25.4s, v1.4s\n"
+ "fmax v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v0.4s\n"
+ "fmin v28.4s, v28.4s, v0.4s\n"
+ "fmin v29.4s, v29.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v1.4s\n"
+ "fmax v28.4s, v28.4s, v1.4s\n"
+ "fmax v29.4s, v29.4s, v1.4s\n"
+ "fmin v30.4s, v30.4s, v0.4s\n"
+ "fmin v31.4s, v31.4s, v0.4s\n"
+ "fmax v30.4s, v30.4s, v1.4s\n"
+ "fmax v31.4s, v31.4s, v1.4s\n"
+ "212:" // Height 6: No activation
+ "cmp x16, #0x10\n"
+ "bge 221f\n"
+ "tbz x16, #3, 216f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v25.4s }, [x23], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
+ "tbz x16, #2, 214f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "st1 { v22.4s }, [x25], #0x10\n"
+ "st1 { v26.4s }, [x23], #0x10\n"
+ "st1 { v30.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 213f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "str d27, [x23], #0x8\n"
+ "str d31, [x21], #0x8\n"
+ "tbz x16, #0, 220f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "st1 { v23.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x23]\n"
+ "st1 { v31.s }[2], [x21]\n"
+ "b 220f\n"
+ "213:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 220f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "str s23, [x25, #0x0]\n"
+ "str s27, [x23, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
+ "b 220f\n"
+ "214:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 215f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "str d26, [x23], #0x8\n"
+ "str d30, [x21], #0x8\n"
+ "tbz x16, #0, 220f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "st1 { v26.s }[2], [x23]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "b 220f\n"
+ "215:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 220f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "str s26, [x23, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "b 220f\n"
+ "216:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 218f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 217f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "str d25, [x23], #0x8\n"
+ "str d29, [x21], #0x8\n"
+ "tbz x16, #0, 220f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "st1 { v25.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "b 220f\n"
+ "217:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 220f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "str s25, [x23, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
+ "b 220f\n"
+ "218:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 219f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x16, #0, 220f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "b 220f\n"
+ "219:" // Height 6: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "str s24, [x23, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
+ "220:" // Height 6: Partial direct writeback: Done
+ "b 222f\n"
+ "221:" // Height 6: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q25, [x23, #0x10]\n"
+ "str q26, [x23, #0x20]\n"
+ "str q27, [x23, #0x30]\n"
+ "str q28, [x21, #0x0]\n"
+ "str q29, [x21, #0x10]\n"
+ "str q30, [x21, #0x20]\n"
+ "str q31, [x21, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "add x23, x23, #0x40\n"
+ "add x21, x21, #0x40\n"
+ "222:" // Height 6: Writeback done
+ "subs x16, x16, #0x10\n"
+ "bgt 188b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 224f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 223f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "223:" // Update direct input
+ "mov x19, #0xc\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "224:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp
new file mode 100644
index 0000000000..46de98504e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+
+#include "../std_transforms_fixed.hpp"
+
+#define ARGLIST /* parameter types of the kernel entry point; used by its declaration and by kern_type */ \
+ unsigned int, const unsigned int *, /* num_strings, string_lengths (names as in the generic.cpp definition) */ \
+ IndirectInputArg<__fp16>, /* A_arg */ \
+ size_t, size_t, /* M, N */ \
+ const __fp16 *, /* B_ptr */ \
+ IndirectOutputArg<__fp16>, /* output_arg */ \
+ const __fp16 *, Activation, bool /* bias, act, accumulate */
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations (defined in a64_hybrid_fp16_mla_6x32/generic.cpp)
+void a64_hybrid_fp16_mla_6x32( ARGLIST );
+
+/* Strategy descriptor for the a64_hybrid_fp16_mla_6x32 kernel.
+ *
+ * Exposes the operand/result element types, the kernel function-pointer type,
+ * the blocking parameters and the kernel entry point as members, plus a
+ * StdTransformsFixed instance parameterised with the same blocking values.
+ */
+class cls_a64_hybrid_fp16_mla_6x32
+{
+public:
+ typedef __fp16 operand_type;
+ typedef __fp16 result_type;
+
+ // Pointer-to-kernel type; ARGLIST supplies the parameter types.
+ typedef void (*kern_type)( ARGLIST );
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ // constexpr like out_height() and k_unroll(): the value is a literal, so it
+ // can be used in constant expressions as well as at run time.
+ static constexpr unsigned int out_width()
+ {
+ return 32;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 1;
+ }
+
+ // The kernel entry point takes a bool "accumulate" argument (last entry of ARGLIST).
+ static constexpr bool supports_accumulate()
+ {
+ return true;
+ }
+
+ // The literals 6, 32, 1 repeat out_height(), out_width() and k_unroll().
+ // NOTE(review): presumably they must stay in sync - confirm against StdTransformsFixed.
+ StdTransformsFixed<operand_type, result_type, 6, 32, 1> transforms = {};
+
+ // Default to the generic kernel
+ kern_type kernel=a64_hybrid_fp16_mla_6x32;
+
+ // The CPUInfo argument is ignored: no CPU-specific kernel is selected here,
+ // so "kernel" keeps the generic default assigned above.
+ cls_a64_hybrid_fp16_mla_6x32(const CPUInfo *)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp
new file mode 100644
index 0000000000..ff6cbec200
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp16_mla_6x32/generic.cpp
@@ -0,0 +1,5400 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#if defined(__aarch64__) && (defined(FP16_KERNELS) || defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC))
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_fp16_mla_6x32 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<__fp16> A_arg,
+ size_t M, size_t N, const __fp16 *B_ptr, IndirectOutputArg<__fp16> output_arg,
+ const __fp16 *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ __fp16 maxval = static_cast<__fp16>(std::numeric_limits<float>::infinity());
+ __fp16 minval = - static_cast<__fp16>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const __fp16 *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<__fp16>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+#ifndef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+ ".arch armv8.2-a+fp16\n"
+#endif
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 251f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 201f\n"
+ "beq 151f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 101f\n"
+ "beq 51f\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #1\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "cbz x14, 4f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "ldr q9, [x14, #0x10]\n"
+ "ldr q10, [x14, #0x20]\n"
+ "ldr q11, [x14, #0x30]\n"
+ "add x14, x14, #0x40\n"
+ "b 23f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 22f\n"
+ "cmp x16, #0x20\n"
+ "bge 21f\n"
+ "tbz x16, #4, 12f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v9.8h }, [x13], #0x10\n"
+ "tbz x16, #3, 8f\n"
+ "ld1 { v10.8h }, [x13], #0x10\n"
+ "tbz x16, #2, 6f\n"
+ "ldr d11, [x13], #0x8\n"
+ "tbz x16, #1, 5f\n"
+ "mov x19, #0x3c\n"
+ "ld1 { v11.s }[2], [x13], #0x4\n"
+ "tbz x16, #0, 20f\n"
+ "ld1 { v11.h }[6], [x13]\n"
+ "b 20f\n"
+ "5:" // Height 1: Partial accumulate: partial_1_28
+ "mov x19, #0x38\n"
+ "tbz x16, #0, 20f\n"
+ "ld1 { v11.h }[4], [x13]\n"
+ "b 20f\n"
+ "6:" // Height 1: Partial accumulate: partial_2_24
+ "tbz x16, #1, 7f\n"
+ "ldr s11, [x13], #0x4\n"
+ "mov x19, #0x34\n"
+ "tbz x16, #0, 20f\n"
+ "ld1 { v11.h }[2], [x13]\n"
+ "b 20f\n"
+ "7:" // Height 1: Partial accumulate: partial_1_24
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 20f\n"
+ "ldr h11, [x13, #0x0]\n"
+ "b 20f\n"
+ "8:" // Height 1: Partial accumulate: partial_4_16
+ "tbz x16, #2, 10f\n"
+ "ldr d10, [x13], #0x8\n"
+ "tbz x16, #1, 9f\n"
+ "mov x19, #0x2c\n"
+ "ld1 { v10.s }[2], [x13], #0x4\n"
+ "tbz x16, #0, 20f\n"
+ "ld1 { v10.h }[6], [x13]\n"
+ "b 20f\n"
+ "9:" // Height 1: Partial accumulate: partial_1_20
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 20f\n"
+ "ld1 { v10.h }[4], [x13]\n"
+ "b 20f\n"
+ "10:" // Height 1: Partial accumulate: partial_2_16
+ "tbz x16, #1, 11f\n"
+ "ldr s10, [x13], #0x4\n"
+ "mov x19, #0x24\n"
+ "tbz x16, #0, 20f\n"
+ "ld1 { v10.h }[2], [x13]\n"
+ "b 20f\n"
+ "11:" // Height 1: Partial accumulate: partial_1_16
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 20f\n"
+ "ldr h10, [x13, #0x0]\n"
+ "b 20f\n"
+ "12:" // Height 1: Partial accumulate: partial_8_0
+ "tbz x16, #3, 16f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "tbz x16, #2, 14f\n"
+ "ldr d9, [x13], #0x8\n"
+ "tbz x16, #1, 13f\n"
+ "mov x19, #0x1c\n"
+ "ld1 { v9.s }[2], [x13], #0x4\n"
+ "tbz x16, #0, 20f\n"
+ "ld1 { v9.h }[6], [x13]\n"
+ "b 20f\n"
+ "13:" // Height 1: Partial accumulate: partial_1_12
+ "mov x19, #0x18\n"
+ "tbz x16, #0, 20f\n"
+ "ld1 { v9.h }[4], [x13]\n"
+ "b 20f\n"
+ "14:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x16, #1, 15f\n"
+ "ldr s9, [x13], #0x4\n"
+ "mov x19, #0x14\n"
+ "tbz x16, #0, 20f\n"
+ "ld1 { v9.h }[2], [x13]\n"
+ "b 20f\n"
+ "15:" // Height 1: Partial accumulate: partial_1_8
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 20f\n"
+ "ldr h9, [x13, #0x0]\n"
+ "b 20f\n"
+ "16:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x16, #2, 18f\n"
+ "ldr d8, [x13], #0x8\n"
+ "tbz x16, #1, 17f\n"
+ "mov x19, #0xc\n"
+ "ld1 { v8.s }[2], [x13], #0x4\n"
+ "tbz x16, #0, 20f\n"
+ "ld1 { v8.h }[6], [x13]\n"
+ "b 20f\n"
+ "17:" // Height 1: Partial accumulate: partial_1_4
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 20f\n"
+ "ld1 { v8.h }[4], [x13]\n"
+ "b 20f\n"
+ "18:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x16, #1, 19f\n"
+ "ldr s8, [x13], #0x4\n"
+ "mov x19, #0x4\n"
+ "tbz x16, #0, 20f\n"
+ "ld1 { v8.h }[2], [x13]\n"
+ "b 20f\n"
+ "19:" // Height 1: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr h8, [x13, #0x0]\n"
+ "20:" // Height 1: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "b 23f\n"
+ "21:" // Height 1: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "b 23f\n"
+ "22:" // Height 1: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "23:" // Height 1: setup done
+ "mov x12, #0x0\n"
+ "24:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 25f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "cbnz x12, 26f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "b 26f\n"
+ "25:" // Height 1: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "26:" // Height 1: input setup done
+ "cmp x11, #0x8\n"
+ "blt 29f\n"
+ "cmp x11, #0x10\n"
+ "blt 28f\n"
+ "27:" // Height 1: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "sub x11, x11, #0x8\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "cmp x11, #0x10\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "ldr q6, [x15, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "ldr q7, [x15, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "ldr q6, [x15, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "ldr q7, [x15, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "ldr q6, [x15, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "ldr q7, [x15, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "ldr q6, [x15, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "ldr q7, [x15, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "ldr q6, [x15, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "ldr q7, [x15, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "ldr q6, [x15, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "ldr q7, [x15, #0x1f0]\n"
+ "add x15, x15, #0x200\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "bge 27b\n"
+ "28:" // Height 1: Multiply loop: Single iteration only
+ "sub x11, x11, #0x8\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "ldr q6, [x15, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "ldr q7, [x15, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "ldr q6, [x15, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "ldr q7, [x15, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "ldr q6, [x15, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "ldr q7, [x15, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "ldr q6, [x15, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "ldr q7, [x15, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "ldr q6, [x15, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "ldr q7, [x15, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "ldr q6, [x15, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "ldr q7, [x15, #0x1f0]\n"
+ "add x15, x15, #0x200\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "29:" // Height 1: Multiply loop: Main loop skip
+ "cbz x11, 31f\n"
+ "30:" // Height 1: Multiply loop: Odd block loop
+ "ldr h0, [x10], #0x2\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "sub x11, x11, #0x1\n"
+ "add x15, x15, #0x40\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "cbnz x11, 30b\n"
+ "31:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 24b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "tbz %x[flags], #1, 32f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.8h }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.8h }, [x19]\n"
+ "fmin v8.8h, v8.8h, v0.8h\n"
+ "fmin v9.8h, v9.8h, v0.8h\n"
+ "fmin v10.8h, v10.8h, v0.8h\n"
+ "fmin v11.8h, v11.8h, v0.8h\n"
+ "fmax v8.8h, v8.8h, v1.8h\n"
+ "fmax v9.8h, v9.8h, v1.8h\n"
+ "fmax v10.8h, v10.8h, v1.8h\n"
+ "fmax v11.8h, v11.8h, v1.8h\n"
+ "32:" // Height 1: No activation
+ "cmp x16, #0x20\n"
+ "bge 49f\n"
+ "tbz x16, #4, 40f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v9.8h }, [x13], #0x10\n"
+ "tbz x16, #3, 36f\n"
+ "st1 { v10.8h }, [x13], #0x10\n"
+ "tbz x16, #2, 34f\n"
+ "str d11, [x13], #0x8\n"
+ "tbz x16, #1, 33f\n"
+ "st1 { v11.s }[2], [x13], #0x4\n"
+ "tbz x16, #0, 48f\n"
+ "st1 { v11.h }[6], [x13]\n"
+ "b 48f\n"
+ "33:" // Height 1: Partial direct writeback: partial_1_28
+ "tbz x16, #0, 48f\n"
+ "st1 { v11.h }[4], [x13]\n"
+ "b 48f\n"
+ "34:" // Height 1: Partial direct writeback: partial_2_24
+ "tbz x16, #1, 35f\n"
+ "str s11, [x13], #0x4\n"
+ "tbz x16, #0, 48f\n"
+ "st1 { v11.h }[2], [x13]\n"
+ "b 48f\n"
+ "35:" // Height 1: Partial direct writeback: partial_1_24
+ "tbz x16, #0, 48f\n"
+ "str h11, [x13, #0x0]\n"
+ "b 48f\n"
+ "36:" // Height 1: Partial direct writeback: partial_4_16
+ "tbz x16, #2, 38f\n"
+ "str d10, [x13], #0x8\n"
+ "tbz x16, #1, 37f\n"
+ "st1 { v10.s }[2], [x13], #0x4\n"
+ "tbz x16, #0, 48f\n"
+ "st1 { v10.h }[6], [x13]\n"
+ "b 48f\n"
+ "37:" // Height 1: Partial direct writeback: partial_1_20
+ "tbz x16, #0, 48f\n"
+ "st1 { v10.h }[4], [x13]\n"
+ "b 48f\n"
+ "38:" // Height 1: Partial direct writeback: partial_2_16
+ "tbz x16, #1, 39f\n"
+ "str s10, [x13], #0x4\n"
+ "tbz x16, #0, 48f\n"
+ "st1 { v10.h }[2], [x13]\n"
+ "b 48f\n"
+ "39:" // Height 1: Partial direct writeback: partial_1_16
+ "tbz x16, #0, 48f\n"
+ "str h10, [x13, #0x0]\n"
+ "b 48f\n"
+ "40:" // Height 1: Partial direct writeback: partial_8_0
+ "tbz x16, #3, 44f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "tbz x16, #2, 42f\n"
+ "str d9, [x13], #0x8\n"
+ "tbz x16, #1, 41f\n"
+ "st1 { v9.s }[2], [x13], #0x4\n"
+ "tbz x16, #0, 48f\n"
+ "st1 { v9.h }[6], [x13]\n"
+ "b 48f\n"
+ "41:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 48f\n"
+ "st1 { v9.h }[4], [x13]\n"
+ "b 48f\n"
+ "42:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 43f\n"
+ "str s9, [x13], #0x4\n"
+ "tbz x16, #0, 48f\n"
+ "st1 { v9.h }[2], [x13]\n"
+ "b 48f\n"
+ "43:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 48f\n"
+ "str h9, [x13, #0x0]\n"
+ "b 48f\n"
+ "44:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 46f\n"
+ "str d8, [x13], #0x8\n"
+ "tbz x16, #1, 45f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "tbz x16, #0, 48f\n"
+ "st1 { v8.h }[6], [x13]\n"
+ "b 48f\n"
+ "45:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 48f\n"
+ "st1 { v8.h }[4], [x13]\n"
+ "b 48f\n"
+ "46:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 47f\n"
+ "str s8, [x13], #0x4\n"
+ "tbz x16, #0, 48f\n"
+ "st1 { v8.h }[2], [x13]\n"
+ "b 48f\n"
+ "47:" // Height 1: Partial direct writeback: partial_1_0
+ "str h8, [x13, #0x0]\n"
+ "48:" // Height 1: Partial direct writeback: Done
+ "b 50f\n"
+ "49:" // Height 1: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "50:" // Height 1: Writeback done
+ "subs x16, x16, #0x20\n"
+ "bgt 3b\n"
+ "b 302f\n"
+ "51:" // Height 2
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 52f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #1\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19, LSL #1\n"
+ "b 53f\n"
+ "52:" // Height 2: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #1\n"
+ "53:" // Height 2: Column loop
+ "cbz x14, 54f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v13.16b, v9.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v14.16b, v10.16b\n"
+ "add x14, x14, #0x40\n"
+ "mov v15.16b, v11.16b\n"
+ "b 73f\n"
+ "54:" // Height 2: no bias
+ "tbz %x[flags], #0, 72f\n"
+ "cmp x16, #0x20\n"
+ "bge 71f\n"
+ "tbz x16, #4, 62f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x9], #0x10\n"
+ "ld1 { v9.8h }, [x13], #0x10\n"
+ "ld1 { v13.8h }, [x9], #0x10\n"
+ "tbz x16, #3, 58f\n"
+ "ld1 { v10.8h }, [x13], #0x10\n"
+ "ld1 { v14.8h }, [x9], #0x10\n"
+ "tbz x16, #2, 56f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "tbz x16, #1, 55f\n"
+ "mov x19, #0x3c\n"
+ "ld1 { v11.s }[2], [x13], #0x4\n"
+ "ld1 { v15.s }[2], [x9], #0x4\n"
+ "tbz x16, #0, 70f\n"
+ "ld1 { v11.h }[6], [x13]\n"
+ "ld1 { v15.h }[6], [x9]\n"
+ "b 70f\n"
+ "55:" // Height 2: Partial accumulate: partial_1_28
+ "mov x19, #0x38\n"
+ "tbz x16, #0, 70f\n"
+ "ld1 { v11.h }[4], [x13]\n"
+ "ld1 { v15.h }[4], [x9]\n"
+ "b 70f\n"
+ "56:" // Height 2: Partial accumulate: partial_2_24
+ "tbz x16, #1, 57f\n"
+ "ldr s11, [x13], #0x4\n"
+ "ldr s15, [x9], #0x4\n"
+ "mov x19, #0x34\n"
+ "tbz x16, #0, 70f\n"
+ "ld1 { v11.h }[2], [x13]\n"
+ "ld1 { v15.h }[2], [x9]\n"
+ "b 70f\n"
+ "57:" // Height 2: Partial accumulate: partial_1_24
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 70f\n"
+ "ldr h11, [x13, #0x0]\n"
+ "ldr h15, [x9, #0x0]\n"
+ "b 70f\n"
+ "58:" // Height 2: Partial accumulate: partial_4_16
+ "tbz x16, #2, 60f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "tbz x16, #1, 59f\n"
+ "mov x19, #0x2c\n"
+ "ld1 { v10.s }[2], [x13], #0x4\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "tbz x16, #0, 70f\n"
+ "ld1 { v10.h }[6], [x13]\n"
+ "ld1 { v14.h }[6], [x9]\n"
+ "b 70f\n"
+ "59:" // Height 2: Partial accumulate: partial_1_20
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 70f\n"
+ "ld1 { v10.h }[4], [x13]\n"
+ "ld1 { v14.h }[4], [x9]\n"
+ "b 70f\n"
+ "60:" // Height 2: Partial accumulate: partial_2_16
+ "tbz x16, #1, 61f\n"
+ "ldr s10, [x13], #0x4\n"
+ "ldr s14, [x9], #0x4\n"
+ "mov x19, #0x24\n"
+ "tbz x16, #0, 70f\n"
+ "ld1 { v10.h }[2], [x13]\n"
+ "ld1 { v14.h }[2], [x9]\n"
+ "b 70f\n"
+ "61:" // Height 2: Partial accumulate: partial_1_16
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 70f\n"
+ "ldr h10, [x13, #0x0]\n"
+ "ldr h14, [x9, #0x0]\n"
+ "b 70f\n"
+ "62:" // Height 2: Partial accumulate: partial_8_0
+ "tbz x16, #3, 66f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x9], #0x10\n"
+ "tbz x16, #2, 64f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "tbz x16, #1, 63f\n"
+ "mov x19, #0x1c\n"
+ "ld1 { v9.s }[2], [x13], #0x4\n"
+ "ld1 { v13.s }[2], [x9], #0x4\n"
+ "tbz x16, #0, 70f\n"
+ "ld1 { v9.h }[6], [x13]\n"
+ "ld1 { v13.h }[6], [x9]\n"
+ "b 70f\n"
+ "63:" // Height 2: Partial accumulate: partial_1_12
+ "mov x19, #0x18\n"
+ "tbz x16, #0, 70f\n"
+ "ld1 { v9.h }[4], [x13]\n"
+ "ld1 { v13.h }[4], [x9]\n"
+ "b 70f\n"
+ "64:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x16, #1, 65f\n"
+ "ldr s9, [x13], #0x4\n"
+ "ldr s13, [x9], #0x4\n"
+ "mov x19, #0x14\n"
+ "tbz x16, #0, 70f\n"
+ "ld1 { v9.h }[2], [x13]\n"
+ "ld1 { v13.h }[2], [x9]\n"
+ "b 70f\n"
+ "65:" // Height 2: Partial accumulate: partial_1_8
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 70f\n"
+ "ldr h9, [x13, #0x0]\n"
+ "ldr h13, [x9, #0x0]\n"
+ "b 70f\n"
+ "66:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x16, #2, 68f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "tbz x16, #1, 67f\n"
+ "mov x19, #0xc\n"
+ "ld1 { v8.s }[2], [x13], #0x4\n"
+ "ld1 { v12.s }[2], [x9], #0x4\n"
+ "tbz x16, #0, 70f\n"
+ "ld1 { v8.h }[6], [x13]\n"
+ "ld1 { v12.h }[6], [x9]\n"
+ "b 70f\n"
+ "67:" // Height 2: Partial accumulate: partial_1_4
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 70f\n"
+ "ld1 { v8.h }[4], [x13]\n"
+ "ld1 { v12.h }[4], [x9]\n"
+ "b 70f\n"
+ "68:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x16, #1, 69f\n"
+ "ldr s8, [x13], #0x4\n"
+ "ldr s12, [x9], #0x4\n"
+ "mov x19, #0x4\n"
+ "tbz x16, #0, 70f\n"
+ "ld1 { v8.h }[2], [x13]\n"
+ "ld1 { v12.h }[2], [x9]\n"
+ "b 70f\n"
+ "69:" // Height 2: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr h8, [x13, #0x0]\n"
+ "ldr h12, [x9, #0x0]\n"
+ "70:" // Height 2: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "b 73f\n"
+ "71:" // Height 2: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "b 73f\n"
+ "72:" // Height 2: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "73:" // Height 2: setup done
+ "mov x12, #0x0\n"
+ "74:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 75f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x12, 76f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "b 76f\n"
+ "75:" // Height 2: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "76:" // Height 2: input setup done
+ "cmp x11, #0x8\n"
+ "blt 79f\n"
+ "cmp x11, #0x10\n"
+ "blt 78f\n"
+ "77:" // Height 2: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "sub x11, x11, #0x8\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "cmp x11, #0x10\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "ldr q6, [x15, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "ldr q7, [x15, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "ldr q6, [x15, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "ldr q7, [x15, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "ldr q6, [x15, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "ldr q7, [x15, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "ldr q6, [x15, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "ldr q7, [x15, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "ldr q6, [x15, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "ldr q7, [x15, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "ldr q6, [x15, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "ldr q7, [x15, #0x1f0]\n"
+ "add x15, x15, #0x200\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "bge 77b\n"
+ "78:" // Height 2: Multiply loop: Single iteration only
+ "sub x11, x11, #0x8\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "ldr q6, [x15, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "ldr q7, [x15, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "ldr q6, [x15, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "ldr q7, [x15, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "ldr q6, [x15, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "ldr q7, [x15, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "ldr q6, [x15, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "ldr q7, [x15, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "ldr q6, [x15, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "ldr q7, [x15, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "ldr q6, [x15, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "ldr q7, [x15, #0x1f0]\n"
+ "add x15, x15, #0x200\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "79:" // Height 2: Multiply loop: Main loop skip
+ "cbz x11, 81f\n"
+ "80:" // Height 2: Multiply loop: Odd block loop
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "sub x11, x11, #0x1\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "cbnz x11, 80b\n"
+ "81:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 74b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "tbz %x[flags], #1, 82f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.8h }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.8h }, [x19]\n"
+ "fmin v8.8h, v8.8h, v0.8h\n"
+ "fmin v9.8h, v9.8h, v0.8h\n"
+ "fmin v10.8h, v10.8h, v0.8h\n"
+ "fmin v11.8h, v11.8h, v0.8h\n"
+ "fmax v8.8h, v8.8h, v1.8h\n"
+ "fmax v9.8h, v9.8h, v1.8h\n"
+ "fmax v10.8h, v10.8h, v1.8h\n"
+ "fmax v11.8h, v11.8h, v1.8h\n"
+ "fmin v12.8h, v12.8h, v0.8h\n"
+ "fmin v13.8h, v13.8h, v0.8h\n"
+ "fmin v14.8h, v14.8h, v0.8h\n"
+ "fmax v12.8h, v12.8h, v1.8h\n"
+ "fmax v13.8h, v13.8h, v1.8h\n"
+ "fmax v14.8h, v14.8h, v1.8h\n"
+ "fmin v15.8h, v15.8h, v0.8h\n"
+ "fmax v15.8h, v15.8h, v1.8h\n"
+ "82:" // Height 2: No activation
+ "cmp x16, #0x20\n"
+ "bge 99f\n"
+ "tbz x16, #4, 90f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v9.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x9], #0x10\n"
+ "st1 { v13.8h }, [x9], #0x10\n"
+ "tbz x16, #3, 86f\n"
+ "st1 { v10.8h }, [x13], #0x10\n"
+ "st1 { v14.8h }, [x9], #0x10\n"
+ "tbz x16, #2, 84f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "tbz x16, #1, 83f\n"
+ "st1 { v11.s }[2], [x13], #0x4\n"
+ "st1 { v15.s }[2], [x9], #0x4\n"
+ "tbz x16, #0, 98f\n"
+ "st1 { v11.h }[6], [x13]\n"
+ "st1 { v15.h }[6], [x9]\n"
+ "b 98f\n"
+ "83:" // Height 2: Partial direct writeback: partial_1_28
+ "tbz x16, #0, 98f\n"
+ "st1 { v11.h }[4], [x13]\n"
+ "st1 { v15.h }[4], [x9]\n"
+ "b 98f\n"
+ "84:" // Height 2: Partial direct writeback: partial_2_24
+ "tbz x16, #1, 85f\n"
+ "str s11, [x13], #0x4\n"
+ "str s15, [x9], #0x4\n"
+ "tbz x16, #0, 98f\n"
+ "st1 { v11.h }[2], [x13]\n"
+ "st1 { v15.h }[2], [x9]\n"
+ "b 98f\n"
+ "85:" // Height 2: Partial direct writeback: partial_1_24
+ "tbz x16, #0, 98f\n"
+ "str h11, [x13, #0x0]\n"
+ "str h15, [x9, #0x0]\n"
+ "b 98f\n"
+ "86:" // Height 2: Partial direct writeback: partial_4_16
+ "tbz x16, #2, 88f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "tbz x16, #1, 87f\n"
+ "st1 { v10.s }[2], [x13], #0x4\n"
+ "st1 { v14.s }[2], [x9], #0x4\n"
+ "tbz x16, #0, 98f\n"
+ "st1 { v10.h }[6], [x13]\n"
+ "st1 { v14.h }[6], [x9]\n"
+ "b 98f\n"
+ "87:" // Height 2: Partial direct writeback: partial_1_20
+ "tbz x16, #0, 98f\n"
+ "st1 { v10.h }[4], [x13]\n"
+ "st1 { v14.h }[4], [x9]\n"
+ "b 98f\n"
+ "88:" // Height 2: Partial direct writeback: partial_2_16
+ "tbz x16, #1, 89f\n"
+ "str s10, [x13], #0x4\n"
+ "str s14, [x9], #0x4\n"
+ "tbz x16, #0, 98f\n"
+ "st1 { v10.h }[2], [x13]\n"
+ "st1 { v14.h }[2], [x9]\n"
+ "b 98f\n"
+ "89:" // Height 2: Partial direct writeback: partial_1_16
+ "tbz x16, #0, 98f\n"
+ "str h10, [x13, #0x0]\n"
+ "str h14, [x9, #0x0]\n"
+ "b 98f\n"
+ "90:" // Height 2: Partial direct writeback: partial_8_0
+ "tbz x16, #3, 94f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x9], #0x10\n"
+ "tbz x16, #2, 92f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "tbz x16, #1, 91f\n"
+ "st1 { v9.s }[2], [x13], #0x4\n"
+ "st1 { v13.s }[2], [x9], #0x4\n"
+ "tbz x16, #0, 98f\n"
+ "st1 { v9.h }[6], [x13]\n"
+ "st1 { v13.h }[6], [x9]\n"
+ "b 98f\n"
+ "91:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 98f\n"
+ "st1 { v9.h }[4], [x13]\n"
+ "st1 { v13.h }[4], [x9]\n"
+ "b 98f\n"
+ "92:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 93f\n"
+ "str s9, [x13], #0x4\n"
+ "str s13, [x9], #0x4\n"
+ "tbz x16, #0, 98f\n"
+ "st1 { v9.h }[2], [x13]\n"
+ "st1 { v13.h }[2], [x9]\n"
+ "b 98f\n"
+ "93:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 98f\n"
+ "str h9, [x13, #0x0]\n"
+ "str h13, [x9, #0x0]\n"
+ "b 98f\n"
+ "94:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 96f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "tbz x16, #1, 95f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x9], #0x4\n"
+ "tbz x16, #0, 98f\n"
+ "st1 { v8.h }[6], [x13]\n"
+ "st1 { v12.h }[6], [x9]\n"
+ "b 98f\n"
+ "95:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 98f\n"
+ "st1 { v8.h }[4], [x13]\n"
+ "st1 { v12.h }[4], [x9]\n"
+ "b 98f\n"
+ "96:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 97f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x9], #0x4\n"
+ "tbz x16, #0, 98f\n"
+ "st1 { v8.h }[2], [x13]\n"
+ "st1 { v12.h }[2], [x9]\n"
+ "b 98f\n"
+ "97:" // Height 2: Partial direct writeback: partial_1_0
+ "str h8, [x13, #0x0]\n"
+ "str h12, [x9, #0x0]\n"
+ "98:" // Height 2: Partial direct writeback: Done
+ "b 100f\n"
+ "99:" // Height 2: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "100:" // Height 2: Writeback done
+ "subs x16, x16, #0x20\n"
+ "bgt 53b\n"
+ "b 302f\n"
+ "101:" // Height 3
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 102f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #1\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #1\n"
+ "add x27, x27, x19, LSL #1\n"
+ "b 103f\n"
+ "102:" // Height 3: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #1\n"
+ "add x27, x9, x19, LSL #1\n"
+ "103:" // Height 3: Column loop
+ "cbz x14, 104f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q10, [x14, #0x20]\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v13.16b, v9.16b\n"
+ "add x14, x14, #0x40\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "b 123f\n"
+ "104:" // Height 3: no bias
+ "tbz %x[flags], #0, 122f\n"
+ "cmp x16, #0x20\n"
+ "bge 121f\n"
+ "tbz x16, #4, 112f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x9], #0x10\n"
+ "ld1 { v16.8h }, [x27], #0x10\n"
+ "ld1 { v9.8h }, [x13], #0x10\n"
+ "ld1 { v13.8h }, [x9], #0x10\n"
+ "ld1 { v17.8h }, [x27], #0x10\n"
+ "tbz x16, #3, 108f\n"
+ "ld1 { v10.8h }, [x13], #0x10\n"
+ "ld1 { v14.8h }, [x9], #0x10\n"
+ "ld1 { v18.8h }, [x27], #0x10\n"
+ "tbz x16, #2, 106f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "tbz x16, #1, 105f\n"
+ "mov x19, #0x3c\n"
+ "ld1 { v11.s }[2], [x13], #0x4\n"
+ "ld1 { v15.s }[2], [x9], #0x4\n"
+ "ld1 { v19.s }[2], [x27], #0x4\n"
+ "tbz x16, #0, 120f\n"
+ "ld1 { v11.h }[6], [x13]\n"
+ "ld1 { v15.h }[6], [x9]\n"
+ "ld1 { v19.h }[6], [x27]\n"
+ "b 120f\n"
+ "105:" // Height 3: Partial accumulate: partial_1_28
+ "mov x19, #0x38\n"
+ "tbz x16, #0, 120f\n"
+ "ld1 { v11.h }[4], [x13]\n"
+ "ld1 { v15.h }[4], [x9]\n"
+ "ld1 { v19.h }[4], [x27]\n"
+ "b 120f\n"
+ "106:" // Height 3: Partial accumulate: partial_2_24
+ "tbz x16, #1, 107f\n"
+ "ldr s11, [x13], #0x4\n"
+ "ldr s15, [x9], #0x4\n"
+ "ldr s19, [x27], #0x4\n"
+ "mov x19, #0x34\n"
+ "tbz x16, #0, 120f\n"
+ "ld1 { v11.h }[2], [x13]\n"
+ "ld1 { v15.h }[2], [x9]\n"
+ "ld1 { v19.h }[2], [x27]\n"
+ "b 120f\n"
+ "107:" // Height 3: Partial accumulate: partial_1_24
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 120f\n"
+ "ldr h11, [x13, #0x0]\n"
+ "ldr h15, [x9, #0x0]\n"
+ "ldr h19, [x27, #0x0]\n"
+ "b 120f\n"
+ "108:" // Height 3: Partial accumulate: partial_4_16
+ "tbz x16, #2, 110f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "tbz x16, #1, 109f\n"
+ "mov x19, #0x2c\n"
+ "ld1 { v10.s }[2], [x13], #0x4\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v18.s }[2], [x27], #0x4\n"
+ "tbz x16, #0, 120f\n"
+ "ld1 { v10.h }[6], [x13]\n"
+ "ld1 { v14.h }[6], [x9]\n"
+ "ld1 { v18.h }[6], [x27]\n"
+ "b 120f\n"
+ "109:" // Height 3: Partial accumulate: partial_1_20
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 120f\n"
+ "ld1 { v10.h }[4], [x13]\n"
+ "ld1 { v14.h }[4], [x9]\n"
+ "ld1 { v18.h }[4], [x27]\n"
+ "b 120f\n"
+ "110:" // Height 3: Partial accumulate: partial_2_16
+ "tbz x16, #1, 111f\n"
+ "ldr s10, [x13], #0x4\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s18, [x27], #0x4\n"
+ "mov x19, #0x24\n"
+ "tbz x16, #0, 120f\n"
+ "ld1 { v10.h }[2], [x13]\n"
+ "ld1 { v14.h }[2], [x9]\n"
+ "ld1 { v18.h }[2], [x27]\n"
+ "b 120f\n"
+ "111:" // Height 3: Partial accumulate: partial_1_16
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 120f\n"
+ "ldr h10, [x13, #0x0]\n"
+ "ldr h14, [x9, #0x0]\n"
+ "ldr h18, [x27, #0x0]\n"
+ "b 120f\n"
+ "112:" // Height 3: Partial accumulate: partial_8_0
+ "tbz x16, #3, 116f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x9], #0x10\n"
+ "ld1 { v16.8h }, [x27], #0x10\n"
+ "tbz x16, #2, 114f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "tbz x16, #1, 113f\n"
+ "mov x19, #0x1c\n"
+ "ld1 { v9.s }[2], [x13], #0x4\n"
+ "ld1 { v13.s }[2], [x9], #0x4\n"
+ "ld1 { v17.s }[2], [x27], #0x4\n"
+ "tbz x16, #0, 120f\n"
+ "ld1 { v9.h }[6], [x13]\n"
+ "ld1 { v13.h }[6], [x9]\n"
+ "ld1 { v17.h }[6], [x27]\n"
+ "b 120f\n"
+ "113:" // Height 3: Partial accumulate: partial_1_12
+ "mov x19, #0x18\n"
+ "tbz x16, #0, 120f\n"
+ "ld1 { v9.h }[4], [x13]\n"
+ "ld1 { v13.h }[4], [x9]\n"
+ "ld1 { v17.h }[4], [x27]\n"
+ "b 120f\n"
+ "114:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x16, #1, 115f\n"
+ "ldr s9, [x13], #0x4\n"
+ "ldr s13, [x9], #0x4\n"
+ "ldr s17, [x27], #0x4\n"
+ "mov x19, #0x14\n"
+ "tbz x16, #0, 120f\n"
+ "ld1 { v9.h }[2], [x13]\n"
+ "ld1 { v13.h }[2], [x9]\n"
+ "ld1 { v17.h }[2], [x27]\n"
+ "b 120f\n"
+ "115:" // Height 3: Partial accumulate: partial_1_8
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 120f\n"
+ "ldr h9, [x13, #0x0]\n"
+ "ldr h13, [x9, #0x0]\n"
+ "ldr h17, [x27, #0x0]\n"
+ "b 120f\n"
+ "116:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x16, #2, 118f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "tbz x16, #1, 117f\n"
+ "mov x19, #0xc\n"
+ "ld1 { v8.s }[2], [x13], #0x4\n"
+ "ld1 { v12.s }[2], [x9], #0x4\n"
+ "ld1 { v16.s }[2], [x27], #0x4\n"
+ "tbz x16, #0, 120f\n"
+ "ld1 { v8.h }[6], [x13]\n"
+ "ld1 { v12.h }[6], [x9]\n"
+ "ld1 { v16.h }[6], [x27]\n"
+ "b 120f\n"
+ "117:" // Height 3: Partial accumulate: partial_1_4
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 120f\n"
+ "ld1 { v8.h }[4], [x13]\n"
+ "ld1 { v12.h }[4], [x9]\n"
+ "ld1 { v16.h }[4], [x27]\n"
+ "b 120f\n"
+ "118:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x16, #1, 119f\n"
+ "ldr s8, [x13], #0x4\n"
+ "ldr s12, [x9], #0x4\n"
+ "ldr s16, [x27], #0x4\n"
+ "mov x19, #0x4\n"
+ "tbz x16, #0, 120f\n"
+ "ld1 { v8.h }[2], [x13]\n"
+ "ld1 { v12.h }[2], [x9]\n"
+ "ld1 { v16.h }[2], [x27]\n"
+ "b 120f\n"
+ "119:" // Height 3: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr h8, [x13, #0x0]\n"
+ "ldr h12, [x9, #0x0]\n"
+ "ldr h16, [x27, #0x0]\n"
+ "120:" // Height 3: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "b 123f\n"
+ "121:" // Height 3: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "b 123f\n"
+ "122:" // Height 3: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "123:" // Height 3: setup done
+ "mov x12, #0x0\n"
+ "124:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 125f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x12, 126f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "b 126f\n"
+ "125:" // Height 3: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "126:" // Height 3: input setup done
+ "cmp x11, #0x8\n"
+ "blt 129f\n"
+ "cmp x11, #0x10\n"
+ "blt 128f\n"
+ "127:" // Height 3: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "sub x11, x11, #0x8\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "cmp x11, #0x10\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "ldr q6, [x15, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "ldr q7, [x15, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "ldr q6, [x15, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "ldr q7, [x15, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "ldr q6, [x15, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "ldr q7, [x15, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "ldr q6, [x15, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "ldr q7, [x15, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "ldr q6, [x15, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "ldr q7, [x15, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "ldr q6, [x15, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "ldr q7, [x15, #0x1f0]\n"
+ "add x15, x15, #0x200\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "bge 127b\n"
+ "128:" // Height 3: Multiply loop: Single iteration only
+ "sub x11, x11, #0x8\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "ldr q6, [x15, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "ldr q7, [x15, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "ldr q6, [x15, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "ldr q7, [x15, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "ldr q6, [x15, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "ldr q7, [x15, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "ldr q6, [x15, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "ldr q7, [x15, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "ldr q6, [x15, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "ldr q7, [x15, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "ldr q6, [x15, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "ldr q7, [x15, #0x1f0]\n"
+ "add x15, x15, #0x200\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "129:" // Height 3: Multiply loop: Main loop skip
+ "cbz x11, 131f\n"
+ "130:" // Height 3: Multiply loop: Odd block loop
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "sub x11, x11, #0x1\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "cbnz x11, 130b\n"
+ "131:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 124b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "tbz %x[flags], #1, 132f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.8h }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.8h }, [x19]\n"
+ "fmin v8.8h, v8.8h, v0.8h\n"
+ "fmin v9.8h, v9.8h, v0.8h\n"
+ "fmin v10.8h, v10.8h, v0.8h\n"
+ "fmin v11.8h, v11.8h, v0.8h\n"
+ "fmax v8.8h, v8.8h, v1.8h\n"
+ "fmax v9.8h, v9.8h, v1.8h\n"
+ "fmax v10.8h, v10.8h, v1.8h\n"
+ "fmax v11.8h, v11.8h, v1.8h\n"
+ "fmin v12.8h, v12.8h, v0.8h\n"
+ "fmin v13.8h, v13.8h, v0.8h\n"
+ "fmin v14.8h, v14.8h, v0.8h\n"
+ "fmax v12.8h, v12.8h, v1.8h\n"
+ "fmax v13.8h, v13.8h, v1.8h\n"
+ "fmax v14.8h, v14.8h, v1.8h\n"
+ "fmin v15.8h, v15.8h, v0.8h\n"
+ "fmin v16.8h, v16.8h, v0.8h\n"
+ "fmin v17.8h, v17.8h, v0.8h\n"
+ "fmax v15.8h, v15.8h, v1.8h\n"
+ "fmax v16.8h, v16.8h, v1.8h\n"
+ "fmax v17.8h, v17.8h, v1.8h\n"
+ "fmin v18.8h, v18.8h, v0.8h\n"
+ "fmin v19.8h, v19.8h, v0.8h\n"
+ "fmax v18.8h, v18.8h, v1.8h\n"
+ "fmax v19.8h, v19.8h, v1.8h\n"
+ "132:" // Height 3: No activation
+ "cmp x16, #0x20\n"
+ "bge 149f\n"
+ "tbz x16, #4, 140f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v9.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x9], #0x10\n"
+ "st1 { v13.8h }, [x9], #0x10\n"
+ "st1 { v16.8h }, [x27], #0x10\n"
+ "st1 { v17.8h }, [x27], #0x10\n"
+ "tbz x16, #3, 136f\n"
+ "st1 { v10.8h }, [x13], #0x10\n"
+ "st1 { v14.8h }, [x9], #0x10\n"
+ "st1 { v18.8h }, [x27], #0x10\n"
+ "tbz x16, #2, 134f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "tbz x16, #1, 133f\n"
+ "st1 { v11.s }[2], [x13], #0x4\n"
+ "st1 { v15.s }[2], [x9], #0x4\n"
+ "st1 { v19.s }[2], [x27], #0x4\n"
+ "tbz x16, #0, 148f\n"
+ "st1 { v11.h }[6], [x13]\n"
+ "st1 { v15.h }[6], [x9]\n"
+ "st1 { v19.h }[6], [x27]\n"
+ "b 148f\n"
+ "133:" // Height 3: Partial direct writeback: partial_1_28
+ "tbz x16, #0, 148f\n"
+ "st1 { v11.h }[4], [x13]\n"
+ "st1 { v15.h }[4], [x9]\n"
+ "st1 { v19.h }[4], [x27]\n"
+ "b 148f\n"
+ "134:" // Height 3: Partial direct writeback: partial_2_24
+ "tbz x16, #1, 135f\n"
+ "str s11, [x13], #0x4\n"
+ "str s15, [x9], #0x4\n"
+ "str s19, [x27], #0x4\n"
+ "tbz x16, #0, 148f\n"
+ "st1 { v11.h }[2], [x13]\n"
+ "st1 { v15.h }[2], [x9]\n"
+ "st1 { v19.h }[2], [x27]\n"
+ "b 148f\n"
+ "135:" // Height 3: Partial direct writeback: partial_1_24
+ "tbz x16, #0, 148f\n"
+ "str h11, [x13, #0x0]\n"
+ "str h15, [x9, #0x0]\n"
+ "str h19, [x27, #0x0]\n"
+ "b 148f\n"
+ "136:" // Height 3: Partial direct writeback: partial_4_16
+ "tbz x16, #2, 138f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "tbz x16, #1, 137f\n"
+ "st1 { v10.s }[2], [x13], #0x4\n"
+ "st1 { v14.s }[2], [x9], #0x4\n"
+ "st1 { v18.s }[2], [x27], #0x4\n"
+ "tbz x16, #0, 148f\n"
+ "st1 { v10.h }[6], [x13]\n"
+ "st1 { v14.h }[6], [x9]\n"
+ "st1 { v18.h }[6], [x27]\n"
+ "b 148f\n"
+ "137:" // Height 3: Partial direct writeback: partial_1_20
+ "tbz x16, #0, 148f\n"
+ "st1 { v10.h }[4], [x13]\n"
+ "st1 { v14.h }[4], [x9]\n"
+ "st1 { v18.h }[4], [x27]\n"
+ "b 148f\n"
+ "138:" // Height 3: Partial direct writeback: partial_2_16
+ "tbz x16, #1, 139f\n"
+ "str s10, [x13], #0x4\n"
+ "str s14, [x9], #0x4\n"
+ "str s18, [x27], #0x4\n"
+ "tbz x16, #0, 148f\n"
+ "st1 { v10.h }[2], [x13]\n"
+ "st1 { v14.h }[2], [x9]\n"
+ "st1 { v18.h }[2], [x27]\n"
+ "b 148f\n"
+ "139:" // Height 3: Partial direct writeback: partial_1_16
+ "tbz x16, #0, 148f\n"
+ "str h10, [x13, #0x0]\n"
+ "str h14, [x9, #0x0]\n"
+ "str h18, [x27, #0x0]\n"
+ "b 148f\n"
+ "140:" // Height 3: Partial direct writeback: partial_8_0
+ "tbz x16, #3, 144f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x9], #0x10\n"
+ "st1 { v16.8h }, [x27], #0x10\n"
+ "tbz x16, #2, 142f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "tbz x16, #1, 141f\n"
+ "st1 { v9.s }[2], [x13], #0x4\n"
+ "st1 { v13.s }[2], [x9], #0x4\n"
+ "st1 { v17.s }[2], [x27], #0x4\n"
+ "tbz x16, #0, 148f\n"
+ "st1 { v9.h }[6], [x13]\n"
+ "st1 { v13.h }[6], [x9]\n"
+ "st1 { v17.h }[6], [x27]\n"
+ "b 148f\n"
+ "141:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 148f\n"
+ "st1 { v9.h }[4], [x13]\n"
+ "st1 { v13.h }[4], [x9]\n"
+ "st1 { v17.h }[4], [x27]\n"
+ "b 148f\n"
+ "142:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 143f\n"
+ "str s9, [x13], #0x4\n"
+ "str s13, [x9], #0x4\n"
+ "str s17, [x27], #0x4\n"
+ "tbz x16, #0, 148f\n"
+ "st1 { v9.h }[2], [x13]\n"
+ "st1 { v13.h }[2], [x9]\n"
+ "st1 { v17.h }[2], [x27]\n"
+ "b 148f\n"
+ "143:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 148f\n"
+ "str h9, [x13, #0x0]\n"
+ "str h13, [x9, #0x0]\n"
+ "str h17, [x27, #0x0]\n"
+ "b 148f\n"
+ "144:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 146f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "tbz x16, #1, 145f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x9], #0x4\n"
+ "st1 { v16.s }[2], [x27], #0x4\n"
+ "tbz x16, #0, 148f\n"
+ "st1 { v8.h }[6], [x13]\n"
+ "st1 { v12.h }[6], [x9]\n"
+ "st1 { v16.h }[6], [x27]\n"
+ "b 148f\n"
+ "145:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 148f\n"
+ "st1 { v8.h }[4], [x13]\n"
+ "st1 { v12.h }[4], [x9]\n"
+ "st1 { v16.h }[4], [x27]\n"
+ "b 148f\n"
+ "146:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 147f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x9], #0x4\n"
+ "str s16, [x27], #0x4\n"
+ "tbz x16, #0, 148f\n"
+ "st1 { v8.h }[2], [x13]\n"
+ "st1 { v12.h }[2], [x9]\n"
+ "st1 { v16.h }[2], [x27]\n"
+ "b 148f\n"
+ "147:" // Height 3: Partial direct writeback: partial_1_0
+ "str h8, [x13, #0x0]\n"
+ "str h12, [x9, #0x0]\n"
+ "str h16, [x27, #0x0]\n"
+ "148:" // Height 3: Partial direct writeback: Done
+ "b 150f\n"
+ "149:" // Height 3: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "150:" // Height 3: Writeback done
+ "subs x16, x16, #0x20\n"
+ "bgt 103b\n"
+ "b 302f\n"
+ "151:" // Height 4
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 152f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #1\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #1\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "add x27, x27, x19, LSL #1\n"
+ "add x25, x25, x19, LSL #1\n"
+ "b 153f\n"
+ "152:" // Height 4: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #1\n"
+ "add x27, x9, x19, LSL #1\n"
+ "add x25, x27, x19, LSL #1\n"
+ "153:" // Height 4: Column loop
+ "cbz x14, 154f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v20.16b, v8.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "add x14, x14, #0x40\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "b 173f\n"
+ "154:" // Height 4: no bias
+ "tbz %x[flags], #0, 172f\n"
+ "cmp x16, #0x20\n"
+ "bge 171f\n"
+ "tbz x16, #4, 162f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x9], #0x10\n"
+ "ld1 { v16.8h }, [x27], #0x10\n"
+ "ld1 { v20.8h }, [x25], #0x10\n"
+ "ld1 { v9.8h }, [x13], #0x10\n"
+ "ld1 { v13.8h }, [x9], #0x10\n"
+ "ld1 { v17.8h }, [x27], #0x10\n"
+ "ld1 { v21.8h }, [x25], #0x10\n"
+ "tbz x16, #3, 158f\n"
+ "ld1 { v10.8h }, [x13], #0x10\n"
+ "ld1 { v14.8h }, [x9], #0x10\n"
+ "ld1 { v18.8h }, [x27], #0x10\n"
+ "ld1 { v22.8h }, [x25], #0x10\n"
+ "tbz x16, #2, 156f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "tbz x16, #1, 155f\n"
+ "mov x19, #0x3c\n"
+ "ld1 { v11.s }[2], [x13], #0x4\n"
+ "ld1 { v15.s }[2], [x9], #0x4\n"
+ "ld1 { v19.s }[2], [x27], #0x4\n"
+ "ld1 { v23.s }[2], [x25], #0x4\n"
+ "tbz x16, #0, 170f\n"
+ "ld1 { v11.h }[6], [x13]\n"
+ "ld1 { v15.h }[6], [x9]\n"
+ "ld1 { v19.h }[6], [x27]\n"
+ "ld1 { v23.h }[6], [x25]\n"
+ "b 170f\n"
+ "155:" // Height 4: Partial accumulate: partial_1_28
+ "mov x19, #0x38\n"
+ "tbz x16, #0, 170f\n"
+ "ld1 { v11.h }[4], [x13]\n"
+ "ld1 { v15.h }[4], [x9]\n"
+ "ld1 { v19.h }[4], [x27]\n"
+ "ld1 { v23.h }[4], [x25]\n"
+ "b 170f\n"
+ "156:" // Height 4: Partial accumulate: partial_2_24
+ "tbz x16, #1, 157f\n"
+ "ldr s11, [x13], #0x4\n"
+ "ldr s15, [x9], #0x4\n"
+ "ldr s19, [x27], #0x4\n"
+ "ldr s23, [x25], #0x4\n"
+ "mov x19, #0x34\n"
+ "tbz x16, #0, 170f\n"
+ "ld1 { v11.h }[2], [x13]\n"
+ "ld1 { v15.h }[2], [x9]\n"
+ "ld1 { v19.h }[2], [x27]\n"
+ "ld1 { v23.h }[2], [x25]\n"
+ "b 170f\n"
+ "157:" // Height 4: Partial accumulate: partial_1_24
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 170f\n"
+ "ldr h11, [x13, #0x0]\n"
+ "ldr h15, [x9, #0x0]\n"
+ "ldr h19, [x27, #0x0]\n"
+ "ldr h23, [x25, #0x0]\n"
+ "b 170f\n"
+ "158:" // Height 4: Partial accumulate: partial_4_16
+ "tbz x16, #2, 160f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "tbz x16, #1, 159f\n"
+ "ld1 { v10.s }[2], [x13], #0x4\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v18.s }[2], [x27], #0x4\n"
+ "ld1 { v22.s }[2], [x25], #0x4\n"
+ "mov x19, #0x2c\n"
+ "tbz x16, #0, 170f\n"
+ "ld1 { v10.h }[6], [x13]\n"
+ "ld1 { v14.h }[6], [x9]\n"
+ "ld1 { v18.h }[6], [x27]\n"
+ "ld1 { v22.h }[6], [x25]\n"
+ "b 170f\n"
+ "159:" // Height 4: Partial accumulate: partial_1_20
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 170f\n"
+ "ld1 { v10.h }[4], [x13]\n"
+ "ld1 { v14.h }[4], [x9]\n"
+ "ld1 { v18.h }[4], [x27]\n"
+ "ld1 { v22.h }[4], [x25]\n"
+ "b 170f\n"
+ "160:" // Height 4: Partial accumulate: partial_2_16
+ "tbz x16, #1, 161f\n"
+ "ldr s10, [x13], #0x4\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s18, [x27], #0x4\n"
+ "ldr s22, [x25], #0x4\n"
+ "mov x19, #0x24\n"
+ "tbz x16, #0, 170f\n"
+ "ld1 { v10.h }[2], [x13]\n"
+ "ld1 { v14.h }[2], [x9]\n"
+ "ld1 { v18.h }[2], [x27]\n"
+ "ld1 { v22.h }[2], [x25]\n"
+ "b 170f\n"
+ "161:" // Height 4: Partial accumulate: partial_1_16
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 170f\n"
+ "ldr h10, [x13, #0x0]\n"
+ "ldr h14, [x9, #0x0]\n"
+ "ldr h18, [x27, #0x0]\n"
+ "ldr h22, [x25, #0x0]\n"
+ "b 170f\n"
+ "162:" // Height 4: Partial accumulate: partial_8_0
+ "tbz x16, #3, 166f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x9], #0x10\n"
+ "ld1 { v16.8h }, [x27], #0x10\n"
+ "ld1 { v20.8h }, [x25], #0x10\n"
+ "tbz x16, #2, 164f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "tbz x16, #1, 163f\n"
+ "mov x19, #0x1c\n"
+ "ld1 { v9.s }[2], [x13], #0x4\n"
+ "ld1 { v13.s }[2], [x9], #0x4\n"
+ "ld1 { v17.s }[2], [x27], #0x4\n"
+ "ld1 { v21.s }[2], [x25], #0x4\n"
+ "tbz x16, #0, 170f\n"
+ "ld1 { v9.h }[6], [x13]\n"
+ "ld1 { v13.h }[6], [x9]\n"
+ "ld1 { v17.h }[6], [x27]\n"
+ "ld1 { v21.h }[6], [x25]\n"
+ "b 170f\n"
+ "163:" // Height 4: Partial accumulate: partial_1_12
+ "mov x19, #0x18\n"
+ "tbz x16, #0, 170f\n"
+ "ld1 { v9.h }[4], [x13]\n"
+ "ld1 { v13.h }[4], [x9]\n"
+ "ld1 { v17.h }[4], [x27]\n"
+ "ld1 { v21.h }[4], [x25]\n"
+ "b 170f\n"
+ "164:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x16, #1, 165f\n"
+ "ldr s9, [x13], #0x4\n"
+ "ldr s13, [x9], #0x4\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s21, [x25], #0x4\n"
+ "mov x19, #0x14\n"
+ "tbz x16, #0, 170f\n"
+ "ld1 { v9.h }[2], [x13]\n"
+ "ld1 { v13.h }[2], [x9]\n"
+ "ld1 { v17.h }[2], [x27]\n"
+ "ld1 { v21.h }[2], [x25]\n"
+ "b 170f\n"
+ "165:" // Height 4: Partial accumulate: partial_1_8
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 170f\n"
+ "ldr h9, [x13, #0x0]\n"
+ "ldr h13, [x9, #0x0]\n"
+ "ldr h17, [x27, #0x0]\n"
+ "ldr h21, [x25, #0x0]\n"
+ "b 170f\n"
+ "166:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x16, #2, 168f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "tbz x16, #1, 167f\n"
+ "ld1 { v8.s }[2], [x13], #0x4\n"
+ "ld1 { v12.s }[2], [x9], #0x4\n"
+ "ld1 { v16.s }[2], [x27], #0x4\n"
+ "ld1 { v20.s }[2], [x25], #0x4\n"
+ "mov x19, #0xc\n"
+ "tbz x16, #0, 170f\n"
+ "ld1 { v8.h }[6], [x13]\n"
+ "ld1 { v12.h }[6], [x9]\n"
+ "ld1 { v16.h }[6], [x27]\n"
+ "ld1 { v20.h }[6], [x25]\n"
+ "b 170f\n"
+ "167:" // Height 4: Partial accumulate: partial_1_4
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 170f\n"
+ "ld1 { v8.h }[4], [x13]\n"
+ "ld1 { v12.h }[4], [x9]\n"
+ "ld1 { v16.h }[4], [x27]\n"
+ "ld1 { v20.h }[4], [x25]\n"
+ "b 170f\n"
+ "168:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x16, #1, 169f\n"
+ "ldr s8, [x13], #0x4\n"
+ "ldr s12, [x9], #0x4\n"
+ "ldr s16, [x27], #0x4\n"
+ "ldr s20, [x25], #0x4\n"
+ "mov x19, #0x4\n"
+ "tbz x16, #0, 170f\n"
+ "ld1 { v8.h }[2], [x13]\n"
+ "ld1 { v12.h }[2], [x9]\n"
+ "ld1 { v16.h }[2], [x27]\n"
+ "ld1 { v20.h }[2], [x25]\n"
+ "b 170f\n"
+ "169:" // Height 4: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr h8, [x13, #0x0]\n"
+ "ldr h12, [x9, #0x0]\n"
+ "ldr h16, [x27, #0x0]\n"
+ "ldr h20, [x25, #0x0]\n"
+ "170:" // Height 4: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "b 173f\n"
+ "171:" // Height 4: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "b 173f\n"
+ "172:" // Height 4: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "173:" // Height 4: setup done
+ "mov x12, #0x0\n"
+ "174:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 175f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x12, 176f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "b 176f\n"
+ "175:" // Height 4: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "176:" // Height 4: input setup done
+ "cmp x11, #0x8\n"
+ "blt 179f\n"
+ "cmp x11, #0x10\n"
+ "blt 178f\n"
+ "177:" // Height 4: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sub x11, x11, #0x8\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "cmp x11, #0x10\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "fmla v20.8h, v6.8h, v3.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "fmla v21.8h, v7.8h, v3.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "fmla v22.8h, v6.8h, v3.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "fmla v23.8h, v7.8h, v3.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "fmla v20.8h, v6.8h, v3.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "fmla v21.8h, v7.8h, v3.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "fmla v22.8h, v6.8h, v3.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "fmla v23.8h, v7.8h, v3.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "fmla v20.8h, v6.8h, v3.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "fmla v21.8h, v7.8h, v3.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "fmla v22.8h, v6.8h, v3.h[3]\n"
+ "ldr q6, [x15, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "fmla v23.8h, v7.8h, v3.h[3]\n"
+ "ldr q7, [x15, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "fmla v20.8h, v6.8h, v3.h[4]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "fmla v21.8h, v7.8h, v3.h[4]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "fmla v22.8h, v6.8h, v3.h[4]\n"
+ "ldr q6, [x15, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "fmla v23.8h, v7.8h, v3.h[4]\n"
+ "ldr q7, [x15, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "fmla v20.8h, v6.8h, v3.h[5]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "fmla v21.8h, v7.8h, v3.h[5]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "fmla v22.8h, v6.8h, v3.h[5]\n"
+ "ldr q6, [x15, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "fmla v23.8h, v7.8h, v3.h[5]\n"
+ "ldr q7, [x15, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "fmla v20.8h, v6.8h, v3.h[6]\n"
+ "ldr q6, [x15, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "fmla v21.8h, v7.8h, v3.h[6]\n"
+ "ldr q7, [x15, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "fmla v22.8h, v6.8h, v3.h[6]\n"
+ "ldr q6, [x15, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "fmla v23.8h, v7.8h, v3.h[6]\n"
+ "ldr q7, [x15, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "fmla v20.8h, v6.8h, v3.h[7]\n"
+ "ldr q6, [x15, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "fmla v21.8h, v7.8h, v3.h[7]\n"
+ "ldr q7, [x15, #0x1f0]\n"
+ "add x15, x15, #0x200\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v22.8h, v6.8h, v3.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "bge 177b\n"
+ "178:" // Height 4: Multiply loop: Single iteration only
+ "sub x11, x11, #0x8\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "fmla v20.8h, v6.8h, v3.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "fmla v21.8h, v7.8h, v3.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "fmla v22.8h, v6.8h, v3.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "fmla v23.8h, v7.8h, v3.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "fmla v20.8h, v6.8h, v3.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "fmla v21.8h, v7.8h, v3.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "fmla v22.8h, v6.8h, v3.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "fmla v23.8h, v7.8h, v3.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "fmla v20.8h, v6.8h, v3.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "fmla v21.8h, v7.8h, v3.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "fmla v22.8h, v6.8h, v3.h[3]\n"
+ "ldr q6, [x15, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "fmla v23.8h, v7.8h, v3.h[3]\n"
+ "ldr q7, [x15, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "fmla v20.8h, v6.8h, v3.h[4]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "fmla v21.8h, v7.8h, v3.h[4]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "fmla v22.8h, v6.8h, v3.h[4]\n"
+ "ldr q6, [x15, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "fmla v23.8h, v7.8h, v3.h[4]\n"
+ "ldr q7, [x15, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "fmla v20.8h, v6.8h, v3.h[5]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "fmla v21.8h, v7.8h, v3.h[5]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "fmla v22.8h, v6.8h, v3.h[5]\n"
+ "ldr q6, [x15, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "fmla v23.8h, v7.8h, v3.h[5]\n"
+ "ldr q7, [x15, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "fmla v20.8h, v6.8h, v3.h[6]\n"
+ "ldr q6, [x15, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "fmla v21.8h, v7.8h, v3.h[6]\n"
+ "ldr q7, [x15, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "fmla v22.8h, v6.8h, v3.h[6]\n"
+ "ldr q6, [x15, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "fmla v23.8h, v7.8h, v3.h[6]\n"
+ "ldr q7, [x15, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "fmla v20.8h, v6.8h, v3.h[7]\n"
+ "ldr q6, [x15, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "fmla v21.8h, v7.8h, v3.h[7]\n"
+ "ldr q7, [x15, #0x1f0]\n"
+ "add x15, x15, #0x200\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v22.8h, v6.8h, v3.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "179:" // Height 4: Multiply loop: Main loop skip
+ "cbz x11, 181f\n"
+ "180:" // Height 4: Multiply loop: Odd block loop
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "sub x11, x11, #0x1\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "cbnz x11, 180b\n"
+ "181:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 174b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbz %x[flags], #1, 182f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.8h }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.8h }, [x19]\n"
+ "fmin v8.8h, v8.8h, v0.8h\n"
+ "fmin v9.8h, v9.8h, v0.8h\n"
+ "fmin v10.8h, v10.8h, v0.8h\n"
+ "fmin v11.8h, v11.8h, v0.8h\n"
+ "fmax v8.8h, v8.8h, v1.8h\n"
+ "fmax v9.8h, v9.8h, v1.8h\n"
+ "fmax v10.8h, v10.8h, v1.8h\n"
+ "fmax v11.8h, v11.8h, v1.8h\n"
+ "fmin v12.8h, v12.8h, v0.8h\n"
+ "fmin v13.8h, v13.8h, v0.8h\n"
+ "fmin v14.8h, v14.8h, v0.8h\n"
+ "fmax v12.8h, v12.8h, v1.8h\n"
+ "fmax v13.8h, v13.8h, v1.8h\n"
+ "fmax v14.8h, v14.8h, v1.8h\n"
+ "fmin v15.8h, v15.8h, v0.8h\n"
+ "fmin v16.8h, v16.8h, v0.8h\n"
+ "fmin v17.8h, v17.8h, v0.8h\n"
+ "fmax v15.8h, v15.8h, v1.8h\n"
+ "fmax v16.8h, v16.8h, v1.8h\n"
+ "fmax v17.8h, v17.8h, v1.8h\n"
+ "fmin v18.8h, v18.8h, v0.8h\n"
+ "fmin v19.8h, v19.8h, v0.8h\n"
+ "fmin v20.8h, v20.8h, v0.8h\n"
+ "fmax v18.8h, v18.8h, v1.8h\n"
+ "fmax v19.8h, v19.8h, v1.8h\n"
+ "fmax v20.8h, v20.8h, v1.8h\n"
+ "fmin v21.8h, v21.8h, v0.8h\n"
+ "fmin v22.8h, v22.8h, v0.8h\n"
+ "fmin v23.8h, v23.8h, v0.8h\n"
+ "fmax v21.8h, v21.8h, v1.8h\n"
+ "fmax v22.8h, v22.8h, v1.8h\n"
+ "fmax v23.8h, v23.8h, v1.8h\n"
+ "182:" // Height 4: No activation
+ "cmp x16, #0x20\n"
+ "bge 199f\n"
+ "tbz x16, #4, 190f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v9.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x9], #0x10\n"
+ "st1 { v13.8h }, [x9], #0x10\n"
+ "st1 { v16.8h }, [x27], #0x10\n"
+ "st1 { v17.8h }, [x27], #0x10\n"
+ "st1 { v20.8h }, [x25], #0x10\n"
+ "st1 { v21.8h }, [x25], #0x10\n"
+ "tbz x16, #3, 186f\n"
+ "st1 { v10.8h }, [x13], #0x10\n"
+ "st1 { v14.8h }, [x9], #0x10\n"
+ "st1 { v18.8h }, [x27], #0x10\n"
+ "st1 { v22.8h }, [x25], #0x10\n"
+ "tbz x16, #2, 184f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "tbz x16, #1, 183f\n"
+ "st1 { v11.s }[2], [x13], #0x4\n"
+ "st1 { v15.s }[2], [x9], #0x4\n"
+ "st1 { v19.s }[2], [x27], #0x4\n"
+ "st1 { v23.s }[2], [x25], #0x4\n"
+ "tbz x16, #0, 198f\n"
+ "st1 { v11.h }[6], [x13]\n"
+ "st1 { v15.h }[6], [x9]\n"
+ "st1 { v19.h }[6], [x27]\n"
+ "st1 { v23.h }[6], [x25]\n"
+ "b 198f\n"
+ "183:" // Height 4: Partial direct writeback: partial_1_28
+ "tbz x16, #0, 198f\n"
+ "st1 { v11.h }[4], [x13]\n"
+ "st1 { v15.h }[4], [x9]\n"
+ "st1 { v19.h }[4], [x27]\n"
+ "st1 { v23.h }[4], [x25]\n"
+ "b 198f\n"
+ "184:" // Height 4: Partial direct writeback: partial_2_24
+ "tbz x16, #1, 185f\n"
+ "str s11, [x13], #0x4\n"
+ "str s15, [x9], #0x4\n"
+ "str s19, [x27], #0x4\n"
+ "str s23, [x25], #0x4\n"
+ "tbz x16, #0, 198f\n"
+ "st1 { v11.h }[2], [x13]\n"
+ "st1 { v15.h }[2], [x9]\n"
+ "st1 { v19.h }[2], [x27]\n"
+ "st1 { v23.h }[2], [x25]\n"
+ "b 198f\n"
+ "185:" // Height 4: Partial direct writeback: partial_1_24
+ "tbz x16, #0, 198f\n"
+ "str h11, [x13, #0x0]\n"
+ "str h15, [x9, #0x0]\n"
+ "str h19, [x27, #0x0]\n"
+ "str h23, [x25, #0x0]\n"
+ "b 198f\n"
+ "186:" // Height 4: Partial direct writeback: partial_4_16
+ "tbz x16, #2, 188f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "tbz x16, #1, 187f\n"
+ "st1 { v10.s }[2], [x13], #0x4\n"
+ "st1 { v14.s }[2], [x9], #0x4\n"
+ "st1 { v18.s }[2], [x27], #0x4\n"
+ "st1 { v22.s }[2], [x25], #0x4\n"
+ "tbz x16, #0, 198f\n"
+ "st1 { v10.h }[6], [x13]\n"
+ "st1 { v14.h }[6], [x9]\n"
+ "st1 { v18.h }[6], [x27]\n"
+ "st1 { v22.h }[6], [x25]\n"
+ "b 198f\n"
+ "187:" // Height 4: Partial direct writeback: partial_1_20
+ "tbz x16, #0, 198f\n"
+ "st1 { v10.h }[4], [x13]\n"
+ "st1 { v14.h }[4], [x9]\n"
+ "st1 { v18.h }[4], [x27]\n"
+ "st1 { v22.h }[4], [x25]\n"
+ "b 198f\n"
+ "188:" // Height 4: Partial direct writeback: partial_2_16
+ "tbz x16, #1, 189f\n"
+ "str s10, [x13], #0x4\n"
+ "str s14, [x9], #0x4\n"
+ "str s18, [x27], #0x4\n"
+ "str s22, [x25], #0x4\n"
+ "tbz x16, #0, 198f\n"
+ "st1 { v10.h }[2], [x13]\n"
+ "st1 { v14.h }[2], [x9]\n"
+ "st1 { v18.h }[2], [x27]\n"
+ "st1 { v22.h }[2], [x25]\n"
+ "b 198f\n"
+ "189:" // Height 4: Partial direct writeback: partial_1_16
+ "tbz x16, #0, 198f\n"
+ "str h10, [x13, #0x0]\n"
+ "str h14, [x9, #0x0]\n"
+ "str h18, [x27, #0x0]\n"
+ "str h22, [x25, #0x0]\n"
+ "b 198f\n"
+ "190:" // Height 4: Partial direct writeback: partial_8_0
+ "tbz x16, #3, 194f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x9], #0x10\n"
+ "st1 { v16.8h }, [x27], #0x10\n"
+ "st1 { v20.8h }, [x25], #0x10\n"
+ "tbz x16, #2, 192f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "tbz x16, #1, 191f\n"
+ "st1 { v9.s }[2], [x13], #0x4\n"
+ "st1 { v13.s }[2], [x9], #0x4\n"
+ "st1 { v17.s }[2], [x27], #0x4\n"
+ "st1 { v21.s }[2], [x25], #0x4\n"
+ "tbz x16, #0, 198f\n"
+ "st1 { v9.h }[6], [x13]\n"
+ "st1 { v13.h }[6], [x9]\n"
+ "st1 { v17.h }[6], [x27]\n"
+ "st1 { v21.h }[6], [x25]\n"
+ "b 198f\n"
+ "191:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 198f\n"
+ "st1 { v9.h }[4], [x13]\n"
+ "st1 { v13.h }[4], [x9]\n"
+ "st1 { v17.h }[4], [x27]\n"
+ "st1 { v21.h }[4], [x25]\n"
+ "b 198f\n"
+ "192:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 193f\n"
+ "str s9, [x13], #0x4\n"
+ "str s13, [x9], #0x4\n"
+ "str s17, [x27], #0x4\n"
+ "str s21, [x25], #0x4\n"
+ "tbz x16, #0, 198f\n"
+ "st1 { v9.h }[2], [x13]\n"
+ "st1 { v13.h }[2], [x9]\n"
+ "st1 { v17.h }[2], [x27]\n"
+ "st1 { v21.h }[2], [x25]\n"
+ "b 198f\n"
+ "193:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 198f\n"
+ "str h9, [x13, #0x0]\n"
+ "str h13, [x9, #0x0]\n"
+ "str h17, [x27, #0x0]\n"
+ "str h21, [x25, #0x0]\n"
+ "b 198f\n"
+ "194:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 196f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "tbz x16, #1, 195f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x9], #0x4\n"
+ "st1 { v16.s }[2], [x27], #0x4\n"
+ "st1 { v20.s }[2], [x25], #0x4\n"
+ "tbz x16, #0, 198f\n"
+ "st1 { v8.h }[6], [x13]\n"
+ "st1 { v12.h }[6], [x9]\n"
+ "st1 { v16.h }[6], [x27]\n"
+ "st1 { v20.h }[6], [x25]\n"
+ "b 198f\n"
+ "195:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 198f\n"
+ "st1 { v8.h }[4], [x13]\n"
+ "st1 { v12.h }[4], [x9]\n"
+ "st1 { v16.h }[4], [x27]\n"
+ "st1 { v20.h }[4], [x25]\n"
+ "b 198f\n"
+ "196:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 197f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x9], #0x4\n"
+ "str s16, [x27], #0x4\n"
+ "str s20, [x25], #0x4\n"
+ "tbz x16, #0, 198f\n"
+ "st1 { v8.h }[2], [x13]\n"
+ "st1 { v12.h }[2], [x9]\n"
+ "st1 { v16.h }[2], [x27]\n"
+ "st1 { v20.h }[2], [x25]\n"
+ "b 198f\n"
+ "197:" // Height 4: Partial direct writeback: partial_1_0
+ "str h8, [x13, #0x0]\n"
+ "str h12, [x9, #0x0]\n"
+ "str h16, [x27, #0x0]\n"
+ "str h20, [x25, #0x0]\n"
+ "198:" // Height 4: Partial direct writeback: Done
+ "b 200f\n"
+ "199:" // Height 4: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "200:" // Height 4: Writeback done
+ "subs x16, x16, #0x20\n"
+ "bgt 153b\n"
+ "b 302f\n"
+ "201:" // Height 5
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 202f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #1\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #1\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #1\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
+ "b 203f\n"
+ "202:" // Height 5: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #1\n"
+ "add x27, x9, x19, LSL #1\n"
+ "add x25, x27, x19, LSL #1\n"
+ "add x23, x25, x19, LSL #1\n"
+ "203:" // Height 5: Column loop
+ "cbz x14, 204f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v20.16b, v8.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v24.16b, v8.16b\n"
+ "add x14, x14, #0x40\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "b 223f\n"
+ "204:" // Height 5: no bias
+ "tbz %x[flags], #0, 222f\n"
+ "cmp x16, #0x20\n"
+ "bge 221f\n"
+ "tbz x16, #4, 212f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x9], #0x10\n"
+ "ld1 { v16.8h }, [x27], #0x10\n"
+ "ld1 { v20.8h }, [x25], #0x10\n"
+ "ld1 { v24.8h }, [x23], #0x10\n"
+ "ld1 { v9.8h }, [x13], #0x10\n"
+ "ld1 { v13.8h }, [x9], #0x10\n"
+ "ld1 { v17.8h }, [x27], #0x10\n"
+ "ld1 { v21.8h }, [x25], #0x10\n"
+ "ld1 { v25.8h }, [x23], #0x10\n"
+ "tbz x16, #3, 208f\n"
+ "ld1 { v10.8h }, [x13], #0x10\n"
+ "ld1 { v14.8h }, [x9], #0x10\n"
+ "ld1 { v18.8h }, [x27], #0x10\n"
+ "ld1 { v22.8h }, [x25], #0x10\n"
+ "ld1 { v26.8h }, [x23], #0x10\n"
+ "tbz x16, #2, 206f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "tbz x16, #1, 205f\n"
+ "ld1 { v11.s }[2], [x13], #0x4\n"
+ "ld1 { v15.s }[2], [x9], #0x4\n"
+ "ld1 { v19.s }[2], [x27], #0x4\n"
+ "ld1 { v23.s }[2], [x25], #0x4\n"
+ "ld1 { v27.s }[2], [x23], #0x4\n"
+ "mov x19, #0x3c\n"
+ "tbz x16, #0, 220f\n"
+ "ld1 { v11.h }[6], [x13]\n"
+ "ld1 { v15.h }[6], [x9]\n"
+ "ld1 { v19.h }[6], [x27]\n"
+ "ld1 { v23.h }[6], [x25]\n"
+ "ld1 { v27.h }[6], [x23]\n"
+ "b 220f\n"
+ "205:" // Height 5: Partial accumulate: partial_1_28
+ "mov x19, #0x38\n"
+ "tbz x16, #0, 220f\n"
+ "ld1 { v11.h }[4], [x13]\n"
+ "ld1 { v15.h }[4], [x9]\n"
+ "ld1 { v19.h }[4], [x27]\n"
+ "ld1 { v23.h }[4], [x25]\n"
+ "ld1 { v27.h }[4], [x23]\n"
+ "b 220f\n"
+ "206:" // Height 5: Partial accumulate: partial_2_24
+ "tbz x16, #1, 207f\n"
+ "ldr s11, [x13], #0x4\n"
+ "ldr s15, [x9], #0x4\n"
+ "ldr s19, [x27], #0x4\n"
+ "ldr s23, [x25], #0x4\n"
+ "ldr s27, [x23], #0x4\n"
+ "mov x19, #0x34\n"
+ "tbz x16, #0, 220f\n"
+ "ld1 { v11.h }[2], [x13]\n"
+ "ld1 { v15.h }[2], [x9]\n"
+ "ld1 { v19.h }[2], [x27]\n"
+ "ld1 { v23.h }[2], [x25]\n"
+ "ld1 { v27.h }[2], [x23]\n"
+ "b 220f\n"
+ "207:" // Height 5: Partial accumulate: partial_1_24
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 220f\n"
+ "ldr h11, [x13, #0x0]\n"
+ "ldr h15, [x9, #0x0]\n"
+ "ldr h19, [x27, #0x0]\n"
+ "ldr h23, [x25, #0x0]\n"
+ "ldr h27, [x23, #0x0]\n"
+ "b 220f\n"
+ "208:" // Height 5: Partial accumulate: partial_4_16
+ "tbz x16, #2, 210f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
+ "tbz x16, #1, 209f\n"
+ "ld1 { v10.s }[2], [x13], #0x4\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v18.s }[2], [x27], #0x4\n"
+ "ld1 { v22.s }[2], [x25], #0x4\n"
+ "ld1 { v26.s }[2], [x23], #0x4\n"
+ "mov x19, #0x2c\n"
+ "tbz x16, #0, 220f\n"
+ "ld1 { v10.h }[6], [x13]\n"
+ "ld1 { v14.h }[6], [x9]\n"
+ "ld1 { v18.h }[6], [x27]\n"
+ "ld1 { v22.h }[6], [x25]\n"
+ "ld1 { v26.h }[6], [x23]\n"
+ "b 220f\n"
+ "209:" // Height 5: Partial accumulate: partial_1_20
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 220f\n"
+ "ld1 { v10.h }[4], [x13]\n"
+ "ld1 { v14.h }[4], [x9]\n"
+ "ld1 { v18.h }[4], [x27]\n"
+ "ld1 { v22.h }[4], [x25]\n"
+ "ld1 { v26.h }[4], [x23]\n"
+ "b 220f\n"
+ "210:" // Height 5: Partial accumulate: partial_2_16
+ "tbz x16, #1, 211f\n"
+ "ldr s10, [x13], #0x4\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s18, [x27], #0x4\n"
+ "ldr s22, [x25], #0x4\n"
+ "ldr s26, [x23], #0x4\n"
+ "mov x19, #0x24\n"
+ "tbz x16, #0, 220f\n"
+ "ld1 { v10.h }[2], [x13]\n"
+ "ld1 { v14.h }[2], [x9]\n"
+ "ld1 { v18.h }[2], [x27]\n"
+ "ld1 { v22.h }[2], [x25]\n"
+ "ld1 { v26.h }[2], [x23]\n"
+ "b 220f\n"
+ "211:" // Height 5: Partial accumulate: partial_1_16
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 220f\n"
+ "ldr h10, [x13, #0x0]\n"
+ "ldr h14, [x9, #0x0]\n"
+ "ldr h18, [x27, #0x0]\n"
+ "ldr h22, [x25, #0x0]\n"
+ "ldr h26, [x23, #0x0]\n"
+ "b 220f\n"
+ "212:" // Height 5: Partial accumulate: partial_8_0
+ "tbz x16, #3, 216f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x9], #0x10\n"
+ "ld1 { v16.8h }, [x27], #0x10\n"
+ "ld1 { v20.8h }, [x25], #0x10\n"
+ "ld1 { v24.8h }, [x23], #0x10\n"
+ "tbz x16, #2, 214f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "tbz x16, #1, 213f\n"
+ "ld1 { v9.s }[2], [x13], #0x4\n"
+ "ld1 { v13.s }[2], [x9], #0x4\n"
+ "ld1 { v17.s }[2], [x27], #0x4\n"
+ "ld1 { v21.s }[2], [x25], #0x4\n"
+ "ld1 { v25.s }[2], [x23], #0x4\n"
+ "mov x19, #0x1c\n"
+ "tbz x16, #0, 220f\n"
+ "ld1 { v9.h }[6], [x13]\n"
+ "ld1 { v13.h }[6], [x9]\n"
+ "ld1 { v17.h }[6], [x27]\n"
+ "ld1 { v21.h }[6], [x25]\n"
+ "ld1 { v25.h }[6], [x23]\n"
+ "b 220f\n"
+ "213:" // Height 5: Partial accumulate: partial_1_12
+ "mov x19, #0x18\n"
+ "tbz x16, #0, 220f\n"
+ "ld1 { v9.h }[4], [x13]\n"
+ "ld1 { v13.h }[4], [x9]\n"
+ "ld1 { v17.h }[4], [x27]\n"
+ "ld1 { v21.h }[4], [x25]\n"
+ "ld1 { v25.h }[4], [x23]\n"
+ "b 220f\n"
+ "214:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x16, #1, 215f\n"
+ "ldr s9, [x13], #0x4\n"
+ "ldr s13, [x9], #0x4\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s21, [x25], #0x4\n"
+ "ldr s25, [x23], #0x4\n"
+ "mov x19, #0x14\n"
+ "tbz x16, #0, 220f\n"
+ "ld1 { v9.h }[2], [x13]\n"
+ "ld1 { v13.h }[2], [x9]\n"
+ "ld1 { v17.h }[2], [x27]\n"
+ "ld1 { v21.h }[2], [x25]\n"
+ "ld1 { v25.h }[2], [x23]\n"
+ "b 220f\n"
+ "215:" // Height 5: Partial accumulate: partial_1_8
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 220f\n"
+ "ldr h9, [x13, #0x0]\n"
+ "ldr h13, [x9, #0x0]\n"
+ "ldr h17, [x27, #0x0]\n"
+ "ldr h21, [x25, #0x0]\n"
+ "ldr h25, [x23, #0x0]\n"
+ "b 220f\n"
+ "216:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x16, #2, 218f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "tbz x16, #1, 217f\n"
+ "ld1 { v8.s }[2], [x13], #0x4\n"
+ "ld1 { v12.s }[2], [x9], #0x4\n"
+ "ld1 { v16.s }[2], [x27], #0x4\n"
+ "ld1 { v20.s }[2], [x25], #0x4\n"
+ "ld1 { v24.s }[2], [x23], #0x4\n"
+ "mov x19, #0xc\n"
+ "tbz x16, #0, 220f\n"
+ "ld1 { v8.h }[6], [x13]\n"
+ "ld1 { v12.h }[6], [x9]\n"
+ "ld1 { v16.h }[6], [x27]\n"
+ "ld1 { v20.h }[6], [x25]\n"
+ "ld1 { v24.h }[6], [x23]\n"
+ "b 220f\n"
+ "217:" // Height 5: Partial accumulate: partial_1_4
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 220f\n"
+ "ld1 { v8.h }[4], [x13]\n"
+ "ld1 { v12.h }[4], [x9]\n"
+ "ld1 { v16.h }[4], [x27]\n"
+ "ld1 { v20.h }[4], [x25]\n"
+ "ld1 { v24.h }[4], [x23]\n"
+ "b 220f\n"
+ "218:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x16, #1, 219f\n"
+ "ldr s8, [x13], #0x4\n"
+ "ldr s12, [x9], #0x4\n"
+ "ldr s16, [x27], #0x4\n"
+ "ldr s20, [x25], #0x4\n"
+ "ldr s24, [x23], #0x4\n"
+ "mov x19, #0x4\n"
+ "tbz x16, #0, 220f\n"
+ "ld1 { v8.h }[2], [x13]\n"
+ "ld1 { v12.h }[2], [x9]\n"
+ "ld1 { v16.h }[2], [x27]\n"
+ "ld1 { v20.h }[2], [x25]\n"
+ "ld1 { v24.h }[2], [x23]\n"
+ "b 220f\n"
+ "219:" // Height 5: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr h8, [x13, #0x0]\n"
+ "ldr h12, [x9, #0x0]\n"
+ "ldr h16, [x27, #0x0]\n"
+ "ldr h20, [x25, #0x0]\n"
+ "ldr h24, [x23, #0x0]\n"
+ "220:" // Height 5: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "sub x23, x23, x19\n"
+ "b 223f\n"
+ "221:" // Height 5: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "ldr q24, [x23, #0x0]\n"
+ "ldr q25, [x23, #0x10]\n"
+ "ldr q26, [x23, #0x20]\n"
+ "ldr q27, [x23, #0x30]\n"
+ "b 223f\n"
+ "222:" // Height 5: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "223:" // Height 5: setup done
+ "mov x12, #0x0\n"
+ "224:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 225f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x12, 226f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "b 226f\n"
+ "225:" // Height 5: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "add x22, x24, x19, LSL #1\n"
+ "226:" // Height 5: input setup done
+ "cmp x11, #0x8\n"
+ "blt 229f\n"
+ "cmp x11, #0x10\n"
+ "blt 228f\n"
+ "227:" // Height 5: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v24.8h, v6.8h, v4.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sub x11, x11, #0x8\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "cmp x11, #0x10\n"
+ "fmla v25.8h, v7.8h, v4.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "fmla v26.8h, v6.8h, v4.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "fmla v27.8h, v7.8h, v4.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "fmla v20.8h, v6.8h, v3.h[1]\n"
+ "fmla v24.8h, v6.8h, v4.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "fmla v21.8h, v7.8h, v3.h[1]\n"
+ "fmla v25.8h, v7.8h, v4.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "fmla v22.8h, v6.8h, v3.h[1]\n"
+ "fmla v26.8h, v6.8h, v4.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "fmla v23.8h, v7.8h, v3.h[1]\n"
+ "fmla v27.8h, v7.8h, v4.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "fmla v20.8h, v6.8h, v3.h[2]\n"
+ "fmla v24.8h, v6.8h, v4.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "fmla v21.8h, v7.8h, v3.h[2]\n"
+ "fmla v25.8h, v7.8h, v4.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "fmla v22.8h, v6.8h, v3.h[2]\n"
+ "fmla v26.8h, v6.8h, v4.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "fmla v23.8h, v7.8h, v3.h[2]\n"
+ "fmla v27.8h, v7.8h, v4.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "fmla v20.8h, v6.8h, v3.h[3]\n"
+ "fmla v24.8h, v6.8h, v4.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "fmla v21.8h, v7.8h, v3.h[3]\n"
+ "fmla v25.8h, v7.8h, v4.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "fmla v22.8h, v6.8h, v3.h[3]\n"
+ "fmla v26.8h, v6.8h, v4.h[3]\n"
+ "ldr q6, [x15, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "fmla v23.8h, v7.8h, v3.h[3]\n"
+ "fmla v27.8h, v7.8h, v4.h[3]\n"
+ "ldr q7, [x15, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "fmla v20.8h, v6.8h, v3.h[4]\n"
+ "fmla v24.8h, v6.8h, v4.h[4]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "fmla v21.8h, v7.8h, v3.h[4]\n"
+ "fmla v25.8h, v7.8h, v4.h[4]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "fmla v22.8h, v6.8h, v3.h[4]\n"
+ "fmla v26.8h, v6.8h, v4.h[4]\n"
+ "ldr q6, [x15, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "fmla v23.8h, v7.8h, v3.h[4]\n"
+ "fmla v27.8h, v7.8h, v4.h[4]\n"
+ "ldr q7, [x15, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "fmla v20.8h, v6.8h, v3.h[5]\n"
+ "fmla v24.8h, v6.8h, v4.h[5]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "fmla v21.8h, v7.8h, v3.h[5]\n"
+ "fmla v25.8h, v7.8h, v4.h[5]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "fmla v22.8h, v6.8h, v3.h[5]\n"
+ "fmla v26.8h, v6.8h, v4.h[5]\n"
+ "ldr q6, [x15, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "fmla v23.8h, v7.8h, v3.h[5]\n"
+ "fmla v27.8h, v7.8h, v4.h[5]\n"
+ "ldr q7, [x15, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "fmla v20.8h, v6.8h, v3.h[6]\n"
+ "fmla v24.8h, v6.8h, v4.h[6]\n"
+ "ldr q6, [x15, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "fmla v21.8h, v7.8h, v3.h[6]\n"
+ "fmla v25.8h, v7.8h, v4.h[6]\n"
+ "ldr q7, [x15, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "fmla v22.8h, v6.8h, v3.h[6]\n"
+ "fmla v26.8h, v6.8h, v4.h[6]\n"
+ "ldr q6, [x15, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "fmla v23.8h, v7.8h, v3.h[6]\n"
+ "fmla v27.8h, v7.8h, v4.h[6]\n"
+ "ldr q7, [x15, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "fmla v20.8h, v6.8h, v3.h[7]\n"
+ "fmla v24.8h, v6.8h, v4.h[7]\n"
+ "ldr q6, [x15, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "fmla v21.8h, v7.8h, v3.h[7]\n"
+ "fmla v25.8h, v7.8h, v4.h[7]\n"
+ "ldr q7, [x15, #0x1f0]\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "add x15, x15, #0x200\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v22.8h, v6.8h, v3.h[7]\n"
+ "fmla v26.8h, v6.8h, v4.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "fmla v27.8h, v7.8h, v4.h[7]\n"
+ "bge 227b\n"
+ "228:" // Height 5: Multiply loop: Single iteration only
+ "sub x11, x11, #0x8\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v24.8h, v6.8h, v4.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "fmla v25.8h, v7.8h, v4.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "fmla v26.8h, v6.8h, v4.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "fmla v27.8h, v7.8h, v4.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "fmla v20.8h, v6.8h, v3.h[1]\n"
+ "fmla v24.8h, v6.8h, v4.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "fmla v21.8h, v7.8h, v3.h[1]\n"
+ "fmla v25.8h, v7.8h, v4.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "fmla v22.8h, v6.8h, v3.h[1]\n"
+ "fmla v26.8h, v6.8h, v4.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "fmla v23.8h, v7.8h, v3.h[1]\n"
+ "fmla v27.8h, v7.8h, v4.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "fmla v20.8h, v6.8h, v3.h[2]\n"
+ "fmla v24.8h, v6.8h, v4.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "fmla v21.8h, v7.8h, v3.h[2]\n"
+ "fmla v25.8h, v7.8h, v4.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "fmla v22.8h, v6.8h, v3.h[2]\n"
+ "fmla v26.8h, v6.8h, v4.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "fmla v23.8h, v7.8h, v3.h[2]\n"
+ "fmla v27.8h, v7.8h, v4.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "fmla v20.8h, v6.8h, v3.h[3]\n"
+ "fmla v24.8h, v6.8h, v4.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "fmla v21.8h, v7.8h, v3.h[3]\n"
+ "fmla v25.8h, v7.8h, v4.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "fmla v22.8h, v6.8h, v3.h[3]\n"
+ "fmla v26.8h, v6.8h, v4.h[3]\n"
+ "ldr q6, [x15, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "fmla v23.8h, v7.8h, v3.h[3]\n"
+ "fmla v27.8h, v7.8h, v4.h[3]\n"
+ "ldr q7, [x15, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "fmla v20.8h, v6.8h, v3.h[4]\n"
+ "fmla v24.8h, v6.8h, v4.h[4]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "fmla v21.8h, v7.8h, v3.h[4]\n"
+ "fmla v25.8h, v7.8h, v4.h[4]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "fmla v22.8h, v6.8h, v3.h[4]\n"
+ "fmla v26.8h, v6.8h, v4.h[4]\n"
+ "ldr q6, [x15, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "fmla v23.8h, v7.8h, v3.h[4]\n"
+ "fmla v27.8h, v7.8h, v4.h[4]\n"
+ "ldr q7, [x15, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "fmla v20.8h, v6.8h, v3.h[5]\n"
+ "fmla v24.8h, v6.8h, v4.h[5]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "fmla v21.8h, v7.8h, v3.h[5]\n"
+ "fmla v25.8h, v7.8h, v4.h[5]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "fmla v22.8h, v6.8h, v3.h[5]\n"
+ "fmla v26.8h, v6.8h, v4.h[5]\n"
+ "ldr q6, [x15, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "fmla v23.8h, v7.8h, v3.h[5]\n"
+ "fmla v27.8h, v7.8h, v4.h[5]\n"
+ "ldr q7, [x15, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "fmla v20.8h, v6.8h, v3.h[6]\n"
+ "fmla v24.8h, v6.8h, v4.h[6]\n"
+ "ldr q6, [x15, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "fmla v21.8h, v7.8h, v3.h[6]\n"
+ "fmla v25.8h, v7.8h, v4.h[6]\n"
+ "ldr q7, [x15, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "fmla v22.8h, v6.8h, v3.h[6]\n"
+ "fmla v26.8h, v6.8h, v4.h[6]\n"
+ "ldr q6, [x15, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "fmla v23.8h, v7.8h, v3.h[6]\n"
+ "fmla v27.8h, v7.8h, v4.h[6]\n"
+ "ldr q7, [x15, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "fmla v20.8h, v6.8h, v3.h[7]\n"
+ "fmla v24.8h, v6.8h, v4.h[7]\n"
+ "ldr q6, [x15, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "fmla v21.8h, v7.8h, v3.h[7]\n"
+ "fmla v25.8h, v7.8h, v4.h[7]\n"
+ "ldr q7, [x15, #0x1f0]\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "add x15, x15, #0x200\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v22.8h, v6.8h, v3.h[7]\n"
+ "fmla v26.8h, v6.8h, v4.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "fmla v27.8h, v7.8h, v4.h[7]\n"
+ "229:" // Height 5: Multiply loop: Main loop skip
+ "cbz x11, 231f\n"
+ "230:" // Height 5: Multiply loop: Odd block loop
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "sub x11, x11, #0x1\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "fmla v24.8h, v6.8h, v4.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "fmla v25.8h, v7.8h, v4.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "fmla v26.8h, v6.8h, v4.h[0]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "fmla v27.8h, v7.8h, v4.h[0]\n"
+ "cbnz x11, 230b\n"
+ "231:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 224b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 232f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.8h }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.8h }, [x19]\n"
+ "fmin v8.8h, v8.8h, v0.8h\n"
+ "fmin v9.8h, v9.8h, v0.8h\n"
+ "fmin v10.8h, v10.8h, v0.8h\n"
+ "fmin v11.8h, v11.8h, v0.8h\n"
+ "fmax v8.8h, v8.8h, v1.8h\n"
+ "fmax v9.8h, v9.8h, v1.8h\n"
+ "fmax v10.8h, v10.8h, v1.8h\n"
+ "fmax v11.8h, v11.8h, v1.8h\n"
+ "fmin v12.8h, v12.8h, v0.8h\n"
+ "fmin v13.8h, v13.8h, v0.8h\n"
+ "fmin v14.8h, v14.8h, v0.8h\n"
+ "fmax v12.8h, v12.8h, v1.8h\n"
+ "fmax v13.8h, v13.8h, v1.8h\n"
+ "fmax v14.8h, v14.8h, v1.8h\n"
+ "fmin v15.8h, v15.8h, v0.8h\n"
+ "fmin v16.8h, v16.8h, v0.8h\n"
+ "fmin v17.8h, v17.8h, v0.8h\n"
+ "fmax v15.8h, v15.8h, v1.8h\n"
+ "fmax v16.8h, v16.8h, v1.8h\n"
+ "fmax v17.8h, v17.8h, v1.8h\n"
+ "fmin v18.8h, v18.8h, v0.8h\n"
+ "fmin v19.8h, v19.8h, v0.8h\n"
+ "fmin v20.8h, v20.8h, v0.8h\n"
+ "fmax v18.8h, v18.8h, v1.8h\n"
+ "fmax v19.8h, v19.8h, v1.8h\n"
+ "fmax v20.8h, v20.8h, v1.8h\n"
+ "fmin v21.8h, v21.8h, v0.8h\n"
+ "fmin v22.8h, v22.8h, v0.8h\n"
+ "fmin v23.8h, v23.8h, v0.8h\n"
+ "fmax v21.8h, v21.8h, v1.8h\n"
+ "fmax v22.8h, v22.8h, v1.8h\n"
+ "fmax v23.8h, v23.8h, v1.8h\n"
+ "fmin v24.8h, v24.8h, v0.8h\n"
+ "fmin v25.8h, v25.8h, v0.8h\n"
+ "fmin v26.8h, v26.8h, v0.8h\n"
+ "fmax v24.8h, v24.8h, v1.8h\n"
+ "fmax v25.8h, v25.8h, v1.8h\n"
+ "fmax v26.8h, v26.8h, v1.8h\n"
+ "fmin v27.8h, v27.8h, v0.8h\n"
+ "fmax v27.8h, v27.8h, v1.8h\n"
+ "232:" // Height 5: No activation
+ "cmp x16, #0x20\n"
+ "bge 249f\n"
+ "tbz x16, #4, 240f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v9.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x9], #0x10\n"
+ "st1 { v13.8h }, [x9], #0x10\n"
+ "st1 { v16.8h }, [x27], #0x10\n"
+ "st1 { v17.8h }, [x27], #0x10\n"
+ "st1 { v20.8h }, [x25], #0x10\n"
+ "st1 { v21.8h }, [x25], #0x10\n"
+ "st1 { v24.8h }, [x23], #0x10\n"
+ "st1 { v25.8h }, [x23], #0x10\n"
+ "tbz x16, #3, 236f\n"
+ "st1 { v10.8h }, [x13], #0x10\n"
+ "st1 { v14.8h }, [x9], #0x10\n"
+ "st1 { v18.8h }, [x27], #0x10\n"
+ "st1 { v22.8h }, [x25], #0x10\n"
+ "st1 { v26.8h }, [x23], #0x10\n"
+ "tbz x16, #2, 234f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "str d27, [x23], #0x8\n"
+ "tbz x16, #1, 233f\n"
+ "st1 { v11.s }[2], [x13], #0x4\n"
+ "st1 { v15.s }[2], [x9], #0x4\n"
+ "st1 { v19.s }[2], [x27], #0x4\n"
+ "st1 { v23.s }[2], [x25], #0x4\n"
+ "st1 { v27.s }[2], [x23], #0x4\n"
+ "tbz x16, #0, 248f\n"
+ "st1 { v11.h }[6], [x13]\n"
+ "st1 { v15.h }[6], [x9]\n"
+ "st1 { v19.h }[6], [x27]\n"
+ "st1 { v23.h }[6], [x25]\n"
+ "st1 { v27.h }[6], [x23]\n"
+ "b 248f\n"
+ "233:" // Height 5: Partial direct writeback: partial_1_28
+ "tbz x16, #0, 248f\n"
+ "st1 { v11.h }[4], [x13]\n"
+ "st1 { v15.h }[4], [x9]\n"
+ "st1 { v19.h }[4], [x27]\n"
+ "st1 { v23.h }[4], [x25]\n"
+ "st1 { v27.h }[4], [x23]\n"
+ "b 248f\n"
+ "234:" // Height 5: Partial direct writeback: partial_2_24
+ "tbz x16, #1, 235f\n"
+ "str s11, [x13], #0x4\n"
+ "str s15, [x9], #0x4\n"
+ "str s19, [x27], #0x4\n"
+ "str s23, [x25], #0x4\n"
+ "str s27, [x23], #0x4\n"
+ "tbz x16, #0, 248f\n"
+ "st1 { v11.h }[2], [x13]\n"
+ "st1 { v15.h }[2], [x9]\n"
+ "st1 { v19.h }[2], [x27]\n"
+ "st1 { v23.h }[2], [x25]\n"
+ "st1 { v27.h }[2], [x23]\n"
+ "b 248f\n"
+ "235:" // Height 5: Partial direct writeback: partial_1_24
+ "tbz x16, #0, 248f\n"
+ "str h11, [x13, #0x0]\n"
+ "str h15, [x9, #0x0]\n"
+ "str h19, [x27, #0x0]\n"
+ "str h23, [x25, #0x0]\n"
+ "str h27, [x23, #0x0]\n"
+ "b 248f\n"
+ "236:" // Height 5: Partial direct writeback: partial_4_16
+ "tbz x16, #2, 238f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "str d26, [x23], #0x8\n"
+ "tbz x16, #1, 237f\n"
+ "st1 { v10.s }[2], [x13], #0x4\n"
+ "st1 { v14.s }[2], [x9], #0x4\n"
+ "st1 { v18.s }[2], [x27], #0x4\n"
+ "st1 { v22.s }[2], [x25], #0x4\n"
+ "st1 { v26.s }[2], [x23], #0x4\n"
+ "tbz x16, #0, 248f\n"
+ "st1 { v10.h }[6], [x13]\n"
+ "st1 { v14.h }[6], [x9]\n"
+ "st1 { v18.h }[6], [x27]\n"
+ "st1 { v22.h }[6], [x25]\n"
+ "st1 { v26.h }[6], [x23]\n"
+ "b 248f\n"
+ "237:" // Height 5: Partial direct writeback: partial_1_20
+ "tbz x16, #0, 248f\n"
+ "st1 { v10.h }[4], [x13]\n"
+ "st1 { v14.h }[4], [x9]\n"
+ "st1 { v18.h }[4], [x27]\n"
+ "st1 { v22.h }[4], [x25]\n"
+ "st1 { v26.h }[4], [x23]\n"
+ "b 248f\n"
+ "238:" // Height 5: Partial direct writeback: partial_2_16
+ "tbz x16, #1, 239f\n"
+ "str s10, [x13], #0x4\n"
+ "str s14, [x9], #0x4\n"
+ "str s18, [x27], #0x4\n"
+ "str s22, [x25], #0x4\n"
+ "str s26, [x23], #0x4\n"
+ "tbz x16, #0, 248f\n"
+ "st1 { v10.h }[2], [x13]\n"
+ "st1 { v14.h }[2], [x9]\n"
+ "st1 { v18.h }[2], [x27]\n"
+ "st1 { v22.h }[2], [x25]\n"
+ "st1 { v26.h }[2], [x23]\n"
+ "b 248f\n"
+ "239:" // Height 5: Partial direct writeback: partial_1_16
+ "tbz x16, #0, 248f\n"
+ "str h10, [x13, #0x0]\n"
+ "str h14, [x9, #0x0]\n"
+ "str h18, [x27, #0x0]\n"
+ "str h22, [x25, #0x0]\n"
+ "str h26, [x23, #0x0]\n"
+ "b 248f\n"
+ "240:" // Height 5: Partial direct writeback: partial_8_0
+ "tbz x16, #3, 244f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x9], #0x10\n"
+ "st1 { v16.8h }, [x27], #0x10\n"
+ "st1 { v20.8h }, [x25], #0x10\n"
+ "st1 { v24.8h }, [x23], #0x10\n"
+ "tbz x16, #2, 242f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "str d25, [x23], #0x8\n"
+ "tbz x16, #1, 241f\n"
+ "st1 { v9.s }[2], [x13], #0x4\n"
+ "st1 { v13.s }[2], [x9], #0x4\n"
+ "st1 { v17.s }[2], [x27], #0x4\n"
+ "st1 { v21.s }[2], [x25], #0x4\n"
+ "st1 { v25.s }[2], [x23], #0x4\n"
+ "tbz x16, #0, 248f\n"
+ "st1 { v9.h }[6], [x13]\n"
+ "st1 { v13.h }[6], [x9]\n"
+ "st1 { v17.h }[6], [x27]\n"
+ "st1 { v21.h }[6], [x25]\n"
+ "st1 { v25.h }[6], [x23]\n"
+ "b 248f\n"
+ "241:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 248f\n"
+ "st1 { v9.h }[4], [x13]\n"
+ "st1 { v13.h }[4], [x9]\n"
+ "st1 { v17.h }[4], [x27]\n"
+ "st1 { v21.h }[4], [x25]\n"
+ "st1 { v25.h }[4], [x23]\n"
+ "b 248f\n"
+ "242:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 243f\n"
+ "str s9, [x13], #0x4\n"
+ "str s13, [x9], #0x4\n"
+ "str s17, [x27], #0x4\n"
+ "str s21, [x25], #0x4\n"
+ "str s25, [x23], #0x4\n"
+ "tbz x16, #0, 248f\n"
+ "st1 { v9.h }[2], [x13]\n"
+ "st1 { v13.h }[2], [x9]\n"
+ "st1 { v17.h }[2], [x27]\n"
+ "st1 { v21.h }[2], [x25]\n"
+ "st1 { v25.h }[2], [x23]\n"
+ "b 248f\n"
+ "243:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 248f\n"
+ "str h9, [x13, #0x0]\n"
+ "str h13, [x9, #0x0]\n"
+ "str h17, [x27, #0x0]\n"
+ "str h21, [x25, #0x0]\n"
+ "str h25, [x23, #0x0]\n"
+ "b 248f\n"
+ "244:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 246f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "tbz x16, #1, 245f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x9], #0x4\n"
+ "st1 { v16.s }[2], [x27], #0x4\n"
+ "st1 { v20.s }[2], [x25], #0x4\n"
+ "st1 { v24.s }[2], [x23], #0x4\n"
+ "tbz x16, #0, 248f\n"
+ "st1 { v8.h }[6], [x13]\n"
+ "st1 { v12.h }[6], [x9]\n"
+ "st1 { v16.h }[6], [x27]\n"
+ "st1 { v20.h }[6], [x25]\n"
+ "st1 { v24.h }[6], [x23]\n"
+ "b 248f\n"
+ "245:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 248f\n"
+ "st1 { v8.h }[4], [x13]\n"
+ "st1 { v12.h }[4], [x9]\n"
+ "st1 { v16.h }[4], [x27]\n"
+ "st1 { v20.h }[4], [x25]\n"
+ "st1 { v24.h }[4], [x23]\n"
+ "b 248f\n"
+ "246:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 247f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x9], #0x4\n"
+ "str s16, [x27], #0x4\n"
+ "str s20, [x25], #0x4\n"
+ "str s24, [x23], #0x4\n"
+ "tbz x16, #0, 248f\n"
+ "st1 { v8.h }[2], [x13]\n"
+ "st1 { v12.h }[2], [x9]\n"
+ "st1 { v16.h }[2], [x27]\n"
+ "st1 { v20.h }[2], [x25]\n"
+ "st1 { v24.h }[2], [x23]\n"
+ "b 248f\n"
+ "247:" // Height 5: Partial direct writeback: partial_1_0
+ "str h8, [x13, #0x0]\n"
+ "str h12, [x9, #0x0]\n"
+ "str h16, [x27, #0x0]\n"
+ "str h20, [x25, #0x0]\n"
+ "str h24, [x23, #0x0]\n"
+ "248:" // Height 5: Partial direct writeback: Done
+ "b 250f\n"
+ "249:" // Height 5: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q25, [x23, #0x10]\n"
+ "str q26, [x23, #0x20]\n"
+ "str q27, [x23, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "add x23, x23, #0x40\n"
+ "250:" // Height 5: Writeback done
+ "subs x16, x16, #0x20\n"
+ "bgt 203b\n"
+ "b 302f\n"
+ "251:" // Height 6
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 252f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #1\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #1\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #1\n"
+ "ldr x21, [%x[output_ptr], #0x28]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
+ "add x21, x21, x19, LSL #1\n"
+ "b 253f\n"
+ "252:" // Height 6: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #1\n"
+ "add x27, x9, x19, LSL #1\n"
+ "add x25, x27, x19, LSL #1\n"
+ "add x23, x25, x19, LSL #1\n"
+ "add x21, x23, x19, LSL #1\n"
+ "add %x[output_ptr], x21, x19, LSL #1\n"
+ "253:" // Height 6: Column loop
+ "cbz x14, 254f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v20.16b, v8.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v24.16b, v8.16b\n"
+ "add x14, x14, #0x40\n"
+ "mov v28.16b, v8.16b\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "mov v29.16b, v9.16b\n"
+ "mov v30.16b, v10.16b\n"
+ "mov v31.16b, v11.16b\n"
+ "b 273f\n"
+ "254:" // Height 6: no bias
+ "tbz %x[flags], #0, 272f\n"
+ "cmp x16, #0x20\n"
+ "bge 271f\n"
+ "tbz x16, #4, 262f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x9], #0x10\n"
+ "ld1 { v16.8h }, [x27], #0x10\n"
+ "ld1 { v20.8h }, [x25], #0x10\n"
+ "ld1 { v24.8h }, [x23], #0x10\n"
+ "ld1 { v28.8h }, [x21], #0x10\n"
+ "ld1 { v9.8h }, [x13], #0x10\n"
+ "ld1 { v13.8h }, [x9], #0x10\n"
+ "ld1 { v17.8h }, [x27], #0x10\n"
+ "ld1 { v21.8h }, [x25], #0x10\n"
+ "ld1 { v25.8h }, [x23], #0x10\n"
+ "ld1 { v29.8h }, [x21], #0x10\n"
+ "tbz x16, #3, 258f\n"
+ "ld1 { v10.8h }, [x13], #0x10\n"
+ "ld1 { v14.8h }, [x9], #0x10\n"
+ "ld1 { v18.8h }, [x27], #0x10\n"
+ "ld1 { v22.8h }, [x25], #0x10\n"
+ "ld1 { v26.8h }, [x23], #0x10\n"
+ "ld1 { v30.8h }, [x21], #0x10\n"
+ "tbz x16, #2, 256f\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "tbz x16, #1, 255f\n"
+ "ld1 { v11.s }[2], [x13], #0x4\n"
+ "ld1 { v15.s }[2], [x9], #0x4\n"
+ "ld1 { v19.s }[2], [x27], #0x4\n"
+ "ld1 { v23.s }[2], [x25], #0x4\n"
+ "ld1 { v27.s }[2], [x23], #0x4\n"
+ "ld1 { v31.s }[2], [x21], #0x4\n"
+ "mov x19, #0x3c\n"
+ "tbz x16, #0, 270f\n"
+ "ld1 { v11.h }[6], [x13]\n"
+ "ld1 { v15.h }[6], [x9]\n"
+ "ld1 { v19.h }[6], [x27]\n"
+ "ld1 { v23.h }[6], [x25]\n"
+ "ld1 { v27.h }[6], [x23]\n"
+ "ld1 { v31.h }[6], [x21]\n"
+ "b 270f\n"
+ "255:" // Height 6: Partial accumulate: partial_1_28
+ "mov x19, #0x38\n"
+ "tbz x16, #0, 270f\n"
+ "ld1 { v11.h }[4], [x13]\n"
+ "ld1 { v15.h }[4], [x9]\n"
+ "ld1 { v19.h }[4], [x27]\n"
+ "ld1 { v23.h }[4], [x25]\n"
+ "ld1 { v27.h }[4], [x23]\n"
+ "ld1 { v31.h }[4], [x21]\n"
+ "b 270f\n"
+ "256:" // Height 6: Partial accumulate: partial_2_24
+ "tbz x16, #1, 257f\n"
+ "ldr s11, [x13], #0x4\n"
+ "ldr s15, [x9], #0x4\n"
+ "ldr s19, [x27], #0x4\n"
+ "ldr s23, [x25], #0x4\n"
+ "ldr s27, [x23], #0x4\n"
+ "ldr s31, [x21], #0x4\n"
+ "mov x19, #0x34\n"
+ "tbz x16, #0, 270f\n"
+ "ld1 { v11.h }[2], [x13]\n"
+ "ld1 { v15.h }[2], [x9]\n"
+ "ld1 { v19.h }[2], [x27]\n"
+ "ld1 { v23.h }[2], [x25]\n"
+ "ld1 { v27.h }[2], [x23]\n"
+ "ld1 { v31.h }[2], [x21]\n"
+ "b 270f\n"
+ "257:" // Height 6: Partial accumulate: partial_1_24
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 270f\n"
+ "ldr h11, [x13, #0x0]\n"
+ "ldr h15, [x9, #0x0]\n"
+ "ldr h19, [x27, #0x0]\n"
+ "ldr h23, [x25, #0x0]\n"
+ "ldr h27, [x23, #0x0]\n"
+ "ldr h31, [x21, #0x0]\n"
+ "b 270f\n"
+ "258:" // Height 6: Partial accumulate: partial_4_16
+ "tbz x16, #2, 260f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
+ "tbz x16, #1, 259f\n"
+ "ld1 { v10.s }[2], [x13], #0x4\n"
+ "ld1 { v14.s }[2], [x9], #0x4\n"
+ "ld1 { v18.s }[2], [x27], #0x4\n"
+ "ld1 { v22.s }[2], [x25], #0x4\n"
+ "ld1 { v26.s }[2], [x23], #0x4\n"
+ "ld1 { v30.s }[2], [x21], #0x4\n"
+ "mov x19, #0x2c\n"
+ "tbz x16, #0, 270f\n"
+ "ld1 { v10.h }[6], [x13]\n"
+ "ld1 { v14.h }[6], [x9]\n"
+ "ld1 { v18.h }[6], [x27]\n"
+ "ld1 { v22.h }[6], [x25]\n"
+ "ld1 { v26.h }[6], [x23]\n"
+ "ld1 { v30.h }[6], [x21]\n"
+ "b 270f\n"
+ "259:" // Height 6: Partial accumulate: partial_1_20
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 270f\n"
+ "ld1 { v10.h }[4], [x13]\n"
+ "ld1 { v14.h }[4], [x9]\n"
+ "ld1 { v18.h }[4], [x27]\n"
+ "ld1 { v22.h }[4], [x25]\n"
+ "ld1 { v26.h }[4], [x23]\n"
+ "ld1 { v30.h }[4], [x21]\n"
+ "b 270f\n"
+ "260:" // Height 6: Partial accumulate: partial_2_16
+ "tbz x16, #1, 261f\n"
+ "ldr s10, [x13], #0x4\n"
+ "ldr s14, [x9], #0x4\n"
+ "ldr s18, [x27], #0x4\n"
+ "ldr s22, [x25], #0x4\n"
+ "ldr s26, [x23], #0x4\n"
+ "ldr s30, [x21], #0x4\n"
+ "mov x19, #0x24\n"
+ "tbz x16, #0, 270f\n"
+ "ld1 { v10.h }[2], [x13]\n"
+ "ld1 { v14.h }[2], [x9]\n"
+ "ld1 { v18.h }[2], [x27]\n"
+ "ld1 { v22.h }[2], [x25]\n"
+ "ld1 { v26.h }[2], [x23]\n"
+ "ld1 { v30.h }[2], [x21]\n"
+ "b 270f\n"
+ "261:" // Height 6: Partial accumulate: partial_1_16
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 270f\n"
+ "ldr h10, [x13, #0x0]\n"
+ "ldr h14, [x9, #0x0]\n"
+ "ldr h18, [x27, #0x0]\n"
+ "ldr h22, [x25, #0x0]\n"
+ "ldr h26, [x23, #0x0]\n"
+ "ldr h30, [x21, #0x0]\n"
+ "b 270f\n"
+ "262:" // Height 6: Partial accumulate: partial_8_0
+ "tbz x16, #3, 266f\n"
+ "ld1 { v8.8h }, [x13], #0x10\n"
+ "ld1 { v12.8h }, [x9], #0x10\n"
+ "ld1 { v16.8h }, [x27], #0x10\n"
+ "ld1 { v20.8h }, [x25], #0x10\n"
+ "ld1 { v24.8h }, [x23], #0x10\n"
+ "ld1 { v28.8h }, [x21], #0x10\n"
+ "tbz x16, #2, 264f\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
+ "tbz x16, #1, 263f\n"
+ "ld1 { v9.s }[2], [x13], #0x4\n"
+ "ld1 { v13.s }[2], [x9], #0x4\n"
+ "ld1 { v17.s }[2], [x27], #0x4\n"
+ "ld1 { v21.s }[2], [x25], #0x4\n"
+ "ld1 { v25.s }[2], [x23], #0x4\n"
+ "ld1 { v29.s }[2], [x21], #0x4\n"
+ "mov x19, #0x1c\n"
+ "tbz x16, #0, 270f\n"
+ "ld1 { v9.h }[6], [x13]\n"
+ "ld1 { v13.h }[6], [x9]\n"
+ "ld1 { v17.h }[6], [x27]\n"
+ "ld1 { v21.h }[6], [x25]\n"
+ "ld1 { v25.h }[6], [x23]\n"
+ "ld1 { v29.h }[6], [x21]\n"
+ "b 270f\n"
+ "263:" // Height 6: Partial accumulate: partial_1_12
+ "mov x19, #0x18\n"
+ "tbz x16, #0, 270f\n"
+ "ld1 { v9.h }[4], [x13]\n"
+ "ld1 { v13.h }[4], [x9]\n"
+ "ld1 { v17.h }[4], [x27]\n"
+ "ld1 { v21.h }[4], [x25]\n"
+ "ld1 { v25.h }[4], [x23]\n"
+ "ld1 { v29.h }[4], [x21]\n"
+ "b 270f\n"
+ "264:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x16, #1, 265f\n"
+ "ldr s9, [x13], #0x4\n"
+ "ldr s13, [x9], #0x4\n"
+ "ldr s17, [x27], #0x4\n"
+ "ldr s21, [x25], #0x4\n"
+ "ldr s25, [x23], #0x4\n"
+ "ldr s29, [x21], #0x4\n"
+ "mov x19, #0x14\n"
+ "tbz x16, #0, 270f\n"
+ "ld1 { v9.h }[2], [x13]\n"
+ "ld1 { v13.h }[2], [x9]\n"
+ "ld1 { v17.h }[2], [x27]\n"
+ "ld1 { v21.h }[2], [x25]\n"
+ "ld1 { v25.h }[2], [x23]\n"
+ "ld1 { v29.h }[2], [x21]\n"
+ "b 270f\n"
+ "265:" // Height 6: Partial accumulate: partial_1_8
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 270f\n"
+ "ldr h9, [x13, #0x0]\n"
+ "ldr h13, [x9, #0x0]\n"
+ "ldr h17, [x27, #0x0]\n"
+ "ldr h21, [x25, #0x0]\n"
+ "ldr h25, [x23, #0x0]\n"
+ "ldr h29, [x21, #0x0]\n"
+ "b 270f\n"
+ "266:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x16, #2, 268f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "tbz x16, #1, 267f\n"
+ "ld1 { v8.s }[2], [x13], #0x4\n"
+ "ld1 { v12.s }[2], [x9], #0x4\n"
+ "ld1 { v16.s }[2], [x27], #0x4\n"
+ "ld1 { v20.s }[2], [x25], #0x4\n"
+ "ld1 { v24.s }[2], [x23], #0x4\n"
+ "ld1 { v28.s }[2], [x21], #0x4\n"
+ "mov x19, #0xc\n"
+ "tbz x16, #0, 270f\n"
+ "ld1 { v8.h }[6], [x13]\n"
+ "ld1 { v12.h }[6], [x9]\n"
+ "ld1 { v16.h }[6], [x27]\n"
+ "ld1 { v20.h }[6], [x25]\n"
+ "ld1 { v24.h }[6], [x23]\n"
+ "ld1 { v28.h }[6], [x21]\n"
+ "b 270f\n"
+ "267:" // Height 6: Partial accumulate: partial_1_4
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 270f\n"
+ "ld1 { v8.h }[4], [x13]\n"
+ "ld1 { v12.h }[4], [x9]\n"
+ "ld1 { v16.h }[4], [x27]\n"
+ "ld1 { v20.h }[4], [x25]\n"
+ "ld1 { v24.h }[4], [x23]\n"
+ "ld1 { v28.h }[4], [x21]\n"
+ "b 270f\n"
+ "268:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x16, #1, 269f\n"
+ "ldr s8, [x13], #0x4\n"
+ "ldr s12, [x9], #0x4\n"
+ "ldr s16, [x27], #0x4\n"
+ "ldr s20, [x25], #0x4\n"
+ "ldr s24, [x23], #0x4\n"
+ "ldr s28, [x21], #0x4\n"
+ "mov x19, #0x4\n"
+ "tbz x16, #0, 270f\n"
+ "ld1 { v8.h }[2], [x13]\n"
+ "ld1 { v12.h }[2], [x9]\n"
+ "ld1 { v16.h }[2], [x27]\n"
+ "ld1 { v20.h }[2], [x25]\n"
+ "ld1 { v24.h }[2], [x23]\n"
+ "ld1 { v28.h }[2], [x21]\n"
+ "b 270f\n"
+ "269:" // Height 6: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr h8, [x13, #0x0]\n"
+ "ldr h12, [x9, #0x0]\n"
+ "ldr h16, [x27, #0x0]\n"
+ "ldr h20, [x25, #0x0]\n"
+ "ldr h24, [x23, #0x0]\n"
+ "ldr h28, [x21, #0x0]\n"
+ "270:" // Height 6: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "sub x23, x23, x19\n"
+ "sub x21, x21, x19\n"
+ "b 273f\n"
+ "271:" // Height 6: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "ldr q24, [x23, #0x0]\n"
+ "ldr q25, [x23, #0x10]\n"
+ "ldr q26, [x23, #0x20]\n"
+ "ldr q27, [x23, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
+ "b 273f\n"
+ "272:" // Height 6: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "273:" // Height 6: setup done
+ "mov x12, #0x0\n"
+ "274:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 275f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x12, 276f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "add x20, x20, x19, LSL #1\n"
+ "b 276f\n"
+ "275:" // Height 6: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "add x22, x24, x19, LSL #1\n"
+ "add x20, x22, x19, LSL #1\n"
+ "276:" // Height 6: input setup done
+ "cmp x11, #0x8\n"
+ "blt 279f\n"
+ "cmp x11, #0x10\n"
+ "blt 278f\n"
+ "277:" // Height 6: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v24.8h, v6.8h, v4.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v28.8h, v6.8h, v5.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "sub x11, x11, #0x8\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "cmp x11, #0x10\n"
+ "fmla v25.8h, v7.8h, v4.h[0]\n"
+ "fmla v29.8h, v7.8h, v5.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "fmla v26.8h, v6.8h, v4.h[0]\n"
+ "fmla v30.8h, v6.8h, v5.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "fmla v27.8h, v7.8h, v4.h[0]\n"
+ "fmla v31.8h, v7.8h, v5.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "fmla v20.8h, v6.8h, v3.h[1]\n"
+ "fmla v24.8h, v6.8h, v4.h[1]\n"
+ "fmla v28.8h, v6.8h, v5.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "fmla v21.8h, v7.8h, v3.h[1]\n"
+ "fmla v25.8h, v7.8h, v4.h[1]\n"
+ "fmla v29.8h, v7.8h, v5.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "fmla v22.8h, v6.8h, v3.h[1]\n"
+ "fmla v26.8h, v6.8h, v4.h[1]\n"
+ "fmla v30.8h, v6.8h, v5.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "fmla v23.8h, v7.8h, v3.h[1]\n"
+ "fmla v27.8h, v7.8h, v4.h[1]\n"
+ "fmla v31.8h, v7.8h, v5.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "fmla v20.8h, v6.8h, v3.h[2]\n"
+ "fmla v24.8h, v6.8h, v4.h[2]\n"
+ "fmla v28.8h, v6.8h, v5.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "fmla v21.8h, v7.8h, v3.h[2]\n"
+ "fmla v25.8h, v7.8h, v4.h[2]\n"
+ "fmla v29.8h, v7.8h, v5.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "fmla v22.8h, v6.8h, v3.h[2]\n"
+ "fmla v26.8h, v6.8h, v4.h[2]\n"
+ "fmla v30.8h, v6.8h, v5.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "fmla v23.8h, v7.8h, v3.h[2]\n"
+ "fmla v27.8h, v7.8h, v4.h[2]\n"
+ "fmla v31.8h, v7.8h, v5.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "fmla v20.8h, v6.8h, v3.h[3]\n"
+ "fmla v24.8h, v6.8h, v4.h[3]\n"
+ "fmla v28.8h, v6.8h, v5.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "fmla v21.8h, v7.8h, v3.h[3]\n"
+ "fmla v25.8h, v7.8h, v4.h[3]\n"
+ "fmla v29.8h, v7.8h, v5.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "fmla v22.8h, v6.8h, v3.h[3]\n"
+ "fmla v26.8h, v6.8h, v4.h[3]\n"
+ "fmla v30.8h, v6.8h, v5.h[3]\n"
+ "ldr q6, [x15, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "fmla v23.8h, v7.8h, v3.h[3]\n"
+ "fmla v27.8h, v7.8h, v4.h[3]\n"
+ "fmla v31.8h, v7.8h, v5.h[3]\n"
+ "ldr q7, [x15, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "fmla v20.8h, v6.8h, v3.h[4]\n"
+ "fmla v24.8h, v6.8h, v4.h[4]\n"
+ "fmla v28.8h, v6.8h, v5.h[4]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "fmla v21.8h, v7.8h, v3.h[4]\n"
+ "fmla v25.8h, v7.8h, v4.h[4]\n"
+ "fmla v29.8h, v7.8h, v5.h[4]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "fmla v22.8h, v6.8h, v3.h[4]\n"
+ "fmla v26.8h, v6.8h, v4.h[4]\n"
+ "fmla v30.8h, v6.8h, v5.h[4]\n"
+ "ldr q6, [x15, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "fmla v23.8h, v7.8h, v3.h[4]\n"
+ "fmla v27.8h, v7.8h, v4.h[4]\n"
+ "fmla v31.8h, v7.8h, v5.h[4]\n"
+ "ldr q7, [x15, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "fmla v20.8h, v6.8h, v3.h[5]\n"
+ "fmla v24.8h, v6.8h, v4.h[5]\n"
+ "fmla v28.8h, v6.8h, v5.h[5]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "fmla v21.8h, v7.8h, v3.h[5]\n"
+ "fmla v25.8h, v7.8h, v4.h[5]\n"
+ "fmla v29.8h, v7.8h, v5.h[5]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "fmla v22.8h, v6.8h, v3.h[5]\n"
+ "fmla v26.8h, v6.8h, v4.h[5]\n"
+ "fmla v30.8h, v6.8h, v5.h[5]\n"
+ "ldr q6, [x15, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "fmla v23.8h, v7.8h, v3.h[5]\n"
+ "fmla v27.8h, v7.8h, v4.h[5]\n"
+ "fmla v31.8h, v7.8h, v5.h[5]\n"
+ "ldr q7, [x15, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "fmla v20.8h, v6.8h, v3.h[6]\n"
+ "fmla v24.8h, v6.8h, v4.h[6]\n"
+ "fmla v28.8h, v6.8h, v5.h[6]\n"
+ "ldr q6, [x15, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "fmla v21.8h, v7.8h, v3.h[6]\n"
+ "fmla v25.8h, v7.8h, v4.h[6]\n"
+ "fmla v29.8h, v7.8h, v5.h[6]\n"
+ "ldr q7, [x15, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "fmla v22.8h, v6.8h, v3.h[6]\n"
+ "fmla v26.8h, v6.8h, v4.h[6]\n"
+ "fmla v30.8h, v6.8h, v5.h[6]\n"
+ "ldr q6, [x15, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "fmla v23.8h, v7.8h, v3.h[6]\n"
+ "fmla v27.8h, v7.8h, v4.h[6]\n"
+ "fmla v31.8h, v7.8h, v5.h[6]\n"
+ "ldr q7, [x15, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "fmla v20.8h, v6.8h, v3.h[7]\n"
+ "fmla v24.8h, v6.8h, v4.h[7]\n"
+ "fmla v28.8h, v6.8h, v5.h[7]\n"
+ "ldr q6, [x15, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "fmla v21.8h, v7.8h, v3.h[7]\n"
+ "fmla v25.8h, v7.8h, v4.h[7]\n"
+ "fmla v29.8h, v7.8h, v5.h[7]\n"
+ "ldr q7, [x15, #0x1f0]\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "add x15, x15, #0x200\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v22.8h, v6.8h, v3.h[7]\n"
+ "fmla v26.8h, v6.8h, v4.h[7]\n"
+ "fmla v30.8h, v6.8h, v5.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "fmla v27.8h, v7.8h, v4.h[7]\n"
+ "fmla v31.8h, v7.8h, v5.h[7]\n"
+ "bge 277b\n"
+ "278:" // Height 6: Multiply loop: Single iteration only
+ "sub x11, x11, #0x8\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v24.8h, v6.8h, v4.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v28.8h, v6.8h, v5.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "fmla v25.8h, v7.8h, v4.h[0]\n"
+ "fmla v29.8h, v7.8h, v5.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "fmla v26.8h, v6.8h, v4.h[0]\n"
+ "fmla v30.8h, v6.8h, v5.h[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "fmla v27.8h, v7.8h, v4.h[0]\n"
+ "fmla v31.8h, v7.8h, v5.h[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.8h, v6.8h, v0.h[1]\n"
+ "fmla v12.8h, v6.8h, v1.h[1]\n"
+ "fmla v16.8h, v6.8h, v2.h[1]\n"
+ "fmla v20.8h, v6.8h, v3.h[1]\n"
+ "fmla v24.8h, v6.8h, v4.h[1]\n"
+ "fmla v28.8h, v6.8h, v5.h[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.8h, v7.8h, v0.h[1]\n"
+ "fmla v13.8h, v7.8h, v1.h[1]\n"
+ "fmla v17.8h, v7.8h, v2.h[1]\n"
+ "fmla v21.8h, v7.8h, v3.h[1]\n"
+ "fmla v25.8h, v7.8h, v4.h[1]\n"
+ "fmla v29.8h, v7.8h, v5.h[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.8h, v6.8h, v0.h[1]\n"
+ "fmla v14.8h, v6.8h, v1.h[1]\n"
+ "fmla v18.8h, v6.8h, v2.h[1]\n"
+ "fmla v22.8h, v6.8h, v3.h[1]\n"
+ "fmla v26.8h, v6.8h, v4.h[1]\n"
+ "fmla v30.8h, v6.8h, v5.h[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.8h, v7.8h, v0.h[1]\n"
+ "fmla v15.8h, v7.8h, v1.h[1]\n"
+ "fmla v19.8h, v7.8h, v2.h[1]\n"
+ "fmla v23.8h, v7.8h, v3.h[1]\n"
+ "fmla v27.8h, v7.8h, v4.h[1]\n"
+ "fmla v31.8h, v7.8h, v5.h[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.8h, v6.8h, v0.h[2]\n"
+ "fmla v12.8h, v6.8h, v1.h[2]\n"
+ "fmla v16.8h, v6.8h, v2.h[2]\n"
+ "fmla v20.8h, v6.8h, v3.h[2]\n"
+ "fmla v24.8h, v6.8h, v4.h[2]\n"
+ "fmla v28.8h, v6.8h, v5.h[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.8h, v7.8h, v0.h[2]\n"
+ "fmla v13.8h, v7.8h, v1.h[2]\n"
+ "fmla v17.8h, v7.8h, v2.h[2]\n"
+ "fmla v21.8h, v7.8h, v3.h[2]\n"
+ "fmla v25.8h, v7.8h, v4.h[2]\n"
+ "fmla v29.8h, v7.8h, v5.h[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.8h, v6.8h, v0.h[2]\n"
+ "fmla v14.8h, v6.8h, v1.h[2]\n"
+ "fmla v18.8h, v6.8h, v2.h[2]\n"
+ "fmla v22.8h, v6.8h, v3.h[2]\n"
+ "fmla v26.8h, v6.8h, v4.h[2]\n"
+ "fmla v30.8h, v6.8h, v5.h[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.8h, v7.8h, v0.h[2]\n"
+ "fmla v15.8h, v7.8h, v1.h[2]\n"
+ "fmla v19.8h, v7.8h, v2.h[2]\n"
+ "fmla v23.8h, v7.8h, v3.h[2]\n"
+ "fmla v27.8h, v7.8h, v4.h[2]\n"
+ "fmla v31.8h, v7.8h, v5.h[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.8h, v6.8h, v0.h[3]\n"
+ "fmla v12.8h, v6.8h, v1.h[3]\n"
+ "fmla v16.8h, v6.8h, v2.h[3]\n"
+ "fmla v20.8h, v6.8h, v3.h[3]\n"
+ "fmla v24.8h, v6.8h, v4.h[3]\n"
+ "fmla v28.8h, v6.8h, v5.h[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.8h, v7.8h, v0.h[3]\n"
+ "fmla v13.8h, v7.8h, v1.h[3]\n"
+ "fmla v17.8h, v7.8h, v2.h[3]\n"
+ "fmla v21.8h, v7.8h, v3.h[3]\n"
+ "fmla v25.8h, v7.8h, v4.h[3]\n"
+ "fmla v29.8h, v7.8h, v5.h[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.8h, v6.8h, v0.h[3]\n"
+ "fmla v14.8h, v6.8h, v1.h[3]\n"
+ "fmla v18.8h, v6.8h, v2.h[3]\n"
+ "fmla v22.8h, v6.8h, v3.h[3]\n"
+ "fmla v26.8h, v6.8h, v4.h[3]\n"
+ "fmla v30.8h, v6.8h, v5.h[3]\n"
+ "ldr q6, [x15, #0x100]\n"
+ "fmla v11.8h, v7.8h, v0.h[3]\n"
+ "fmla v15.8h, v7.8h, v1.h[3]\n"
+ "fmla v19.8h, v7.8h, v2.h[3]\n"
+ "fmla v23.8h, v7.8h, v3.h[3]\n"
+ "fmla v27.8h, v7.8h, v4.h[3]\n"
+ "fmla v31.8h, v7.8h, v5.h[3]\n"
+ "ldr q7, [x15, #0x110]\n"
+ "fmla v8.8h, v6.8h, v0.h[4]\n"
+ "fmla v12.8h, v6.8h, v1.h[4]\n"
+ "fmla v16.8h, v6.8h, v2.h[4]\n"
+ "fmla v20.8h, v6.8h, v3.h[4]\n"
+ "fmla v24.8h, v6.8h, v4.h[4]\n"
+ "fmla v28.8h, v6.8h, v5.h[4]\n"
+ "ldr q6, [x15, #0x120]\n"
+ "fmla v9.8h, v7.8h, v0.h[4]\n"
+ "fmla v13.8h, v7.8h, v1.h[4]\n"
+ "fmla v17.8h, v7.8h, v2.h[4]\n"
+ "fmla v21.8h, v7.8h, v3.h[4]\n"
+ "fmla v25.8h, v7.8h, v4.h[4]\n"
+ "fmla v29.8h, v7.8h, v5.h[4]\n"
+ "ldr q7, [x15, #0x130]\n"
+ "fmla v10.8h, v6.8h, v0.h[4]\n"
+ "fmla v14.8h, v6.8h, v1.h[4]\n"
+ "fmla v18.8h, v6.8h, v2.h[4]\n"
+ "fmla v22.8h, v6.8h, v3.h[4]\n"
+ "fmla v26.8h, v6.8h, v4.h[4]\n"
+ "fmla v30.8h, v6.8h, v5.h[4]\n"
+ "ldr q6, [x15, #0x140]\n"
+ "fmla v11.8h, v7.8h, v0.h[4]\n"
+ "fmla v15.8h, v7.8h, v1.h[4]\n"
+ "fmla v19.8h, v7.8h, v2.h[4]\n"
+ "fmla v23.8h, v7.8h, v3.h[4]\n"
+ "fmla v27.8h, v7.8h, v4.h[4]\n"
+ "fmla v31.8h, v7.8h, v5.h[4]\n"
+ "ldr q7, [x15, #0x150]\n"
+ "fmla v8.8h, v6.8h, v0.h[5]\n"
+ "fmla v12.8h, v6.8h, v1.h[5]\n"
+ "fmla v16.8h, v6.8h, v2.h[5]\n"
+ "fmla v20.8h, v6.8h, v3.h[5]\n"
+ "fmla v24.8h, v6.8h, v4.h[5]\n"
+ "fmla v28.8h, v6.8h, v5.h[5]\n"
+ "ldr q6, [x15, #0x160]\n"
+ "fmla v9.8h, v7.8h, v0.h[5]\n"
+ "fmla v13.8h, v7.8h, v1.h[5]\n"
+ "fmla v17.8h, v7.8h, v2.h[5]\n"
+ "fmla v21.8h, v7.8h, v3.h[5]\n"
+ "fmla v25.8h, v7.8h, v4.h[5]\n"
+ "fmla v29.8h, v7.8h, v5.h[5]\n"
+ "ldr q7, [x15, #0x170]\n"
+ "fmla v10.8h, v6.8h, v0.h[5]\n"
+ "fmla v14.8h, v6.8h, v1.h[5]\n"
+ "fmla v18.8h, v6.8h, v2.h[5]\n"
+ "fmla v22.8h, v6.8h, v3.h[5]\n"
+ "fmla v26.8h, v6.8h, v4.h[5]\n"
+ "fmla v30.8h, v6.8h, v5.h[5]\n"
+ "ldr q6, [x15, #0x180]\n"
+ "fmla v11.8h, v7.8h, v0.h[5]\n"
+ "fmla v15.8h, v7.8h, v1.h[5]\n"
+ "fmla v19.8h, v7.8h, v2.h[5]\n"
+ "fmla v23.8h, v7.8h, v3.h[5]\n"
+ "fmla v27.8h, v7.8h, v4.h[5]\n"
+ "fmla v31.8h, v7.8h, v5.h[5]\n"
+ "ldr q7, [x15, #0x190]\n"
+ "fmla v8.8h, v6.8h, v0.h[6]\n"
+ "fmla v12.8h, v6.8h, v1.h[6]\n"
+ "fmla v16.8h, v6.8h, v2.h[6]\n"
+ "fmla v20.8h, v6.8h, v3.h[6]\n"
+ "fmla v24.8h, v6.8h, v4.h[6]\n"
+ "fmla v28.8h, v6.8h, v5.h[6]\n"
+ "ldr q6, [x15, #0x1a0]\n"
+ "fmla v9.8h, v7.8h, v0.h[6]\n"
+ "fmla v13.8h, v7.8h, v1.h[6]\n"
+ "fmla v17.8h, v7.8h, v2.h[6]\n"
+ "fmla v21.8h, v7.8h, v3.h[6]\n"
+ "fmla v25.8h, v7.8h, v4.h[6]\n"
+ "fmla v29.8h, v7.8h, v5.h[6]\n"
+ "ldr q7, [x15, #0x1b0]\n"
+ "fmla v10.8h, v6.8h, v0.h[6]\n"
+ "fmla v14.8h, v6.8h, v1.h[6]\n"
+ "fmla v18.8h, v6.8h, v2.h[6]\n"
+ "fmla v22.8h, v6.8h, v3.h[6]\n"
+ "fmla v26.8h, v6.8h, v4.h[6]\n"
+ "fmla v30.8h, v6.8h, v5.h[6]\n"
+ "ldr q6, [x15, #0x1c0]\n"
+ "fmla v11.8h, v7.8h, v0.h[6]\n"
+ "fmla v15.8h, v7.8h, v1.h[6]\n"
+ "fmla v19.8h, v7.8h, v2.h[6]\n"
+ "fmla v23.8h, v7.8h, v3.h[6]\n"
+ "fmla v27.8h, v7.8h, v4.h[6]\n"
+ "fmla v31.8h, v7.8h, v5.h[6]\n"
+ "ldr q7, [x15, #0x1d0]\n"
+ "fmla v8.8h, v6.8h, v0.h[7]\n"
+ "fmla v12.8h, v6.8h, v1.h[7]\n"
+ "fmla v16.8h, v6.8h, v2.h[7]\n"
+ "fmla v20.8h, v6.8h, v3.h[7]\n"
+ "fmla v24.8h, v6.8h, v4.h[7]\n"
+ "fmla v28.8h, v6.8h, v5.h[7]\n"
+ "ldr q6, [x15, #0x1e0]\n"
+ "fmla v9.8h, v7.8h, v0.h[7]\n"
+ "fmla v13.8h, v7.8h, v1.h[7]\n"
+ "fmla v17.8h, v7.8h, v2.h[7]\n"
+ "fmla v21.8h, v7.8h, v3.h[7]\n"
+ "fmla v25.8h, v7.8h, v4.h[7]\n"
+ "fmla v29.8h, v7.8h, v5.h[7]\n"
+ "ldr q7, [x15, #0x1f0]\n"
+ "fmla v10.8h, v6.8h, v0.h[7]\n"
+ "add x15, x15, #0x200\n"
+ "fmla v14.8h, v6.8h, v1.h[7]\n"
+ "fmla v18.8h, v6.8h, v2.h[7]\n"
+ "fmla v22.8h, v6.8h, v3.h[7]\n"
+ "fmla v26.8h, v6.8h, v4.h[7]\n"
+ "fmla v30.8h, v6.8h, v5.h[7]\n"
+ "fmla v11.8h, v7.8h, v0.h[7]\n"
+ "fmla v15.8h, v7.8h, v1.h[7]\n"
+ "fmla v19.8h, v7.8h, v2.h[7]\n"
+ "fmla v23.8h, v7.8h, v3.h[7]\n"
+ "fmla v27.8h, v7.8h, v4.h[7]\n"
+ "fmla v31.8h, v7.8h, v5.h[7]\n"
+ "279:" // Height 6: Multiply loop: Main loop skip
+ "cbz x11, 281f\n"
+ "280:" // Height 6: Multiply loop: Odd block loop
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h5, [x20], #0x2\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.8h, v6.8h, v0.h[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.8h, v6.8h, v1.h[0]\n"
+ "sub x11, x11, #0x1\n"
+ "fmla v16.8h, v6.8h, v2.h[0]\n"
+ "fmla v20.8h, v6.8h, v3.h[0]\n"
+ "fmla v24.8h, v6.8h, v4.h[0]\n"
+ "fmla v28.8h, v6.8h, v5.h[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.8h, v7.8h, v0.h[0]\n"
+ "fmla v13.8h, v7.8h, v1.h[0]\n"
+ "fmla v17.8h, v7.8h, v2.h[0]\n"
+ "fmla v21.8h, v7.8h, v3.h[0]\n"
+ "fmla v25.8h, v7.8h, v4.h[0]\n"
+ "fmla v29.8h, v7.8h, v5.h[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.8h, v6.8h, v0.h[0]\n"
+ "add x15, x15, #0x40\n"
+ "fmla v14.8h, v6.8h, v1.h[0]\n"
+ "fmla v18.8h, v6.8h, v2.h[0]\n"
+ "fmla v22.8h, v6.8h, v3.h[0]\n"
+ "fmla v26.8h, v6.8h, v4.h[0]\n"
+ "fmla v30.8h, v6.8h, v5.h[0]\n"
+ "fmla v11.8h, v7.8h, v0.h[0]\n"
+ "fmla v15.8h, v7.8h, v1.h[0]\n"
+ "fmla v19.8h, v7.8h, v2.h[0]\n"
+ "fmla v23.8h, v7.8h, v3.h[0]\n"
+ "fmla v27.8h, v7.8h, v4.h[0]\n"
+ "fmla v31.8h, v7.8h, v5.h[0]\n"
+ "cbnz x11, 280b\n"
+ "281:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 274b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 282f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.8h }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.8h }, [x19]\n"
+ "fmin v8.8h, v8.8h, v0.8h\n"
+ "fmin v9.8h, v9.8h, v0.8h\n"
+ "fmin v10.8h, v10.8h, v0.8h\n"
+ "fmin v11.8h, v11.8h, v0.8h\n"
+ "fmax v8.8h, v8.8h, v1.8h\n"
+ "fmax v9.8h, v9.8h, v1.8h\n"
+ "fmax v10.8h, v10.8h, v1.8h\n"
+ "fmax v11.8h, v11.8h, v1.8h\n"
+ "fmin v12.8h, v12.8h, v0.8h\n"
+ "fmin v13.8h, v13.8h, v0.8h\n"
+ "fmin v14.8h, v14.8h, v0.8h\n"
+ "fmax v12.8h, v12.8h, v1.8h\n"
+ "fmax v13.8h, v13.8h, v1.8h\n"
+ "fmax v14.8h, v14.8h, v1.8h\n"
+ "fmin v15.8h, v15.8h, v0.8h\n"
+ "fmin v16.8h, v16.8h, v0.8h\n"
+ "fmin v17.8h, v17.8h, v0.8h\n"
+ "fmax v15.8h, v15.8h, v1.8h\n"
+ "fmax v16.8h, v16.8h, v1.8h\n"
+ "fmax v17.8h, v17.8h, v1.8h\n"
+ "fmin v18.8h, v18.8h, v0.8h\n"
+ "fmin v19.8h, v19.8h, v0.8h\n"
+ "fmin v20.8h, v20.8h, v0.8h\n"
+ "fmax v18.8h, v18.8h, v1.8h\n"
+ "fmax v19.8h, v19.8h, v1.8h\n"
+ "fmax v20.8h, v20.8h, v1.8h\n"
+ "fmin v21.8h, v21.8h, v0.8h\n"
+ "fmin v22.8h, v22.8h, v0.8h\n"
+ "fmin v23.8h, v23.8h, v0.8h\n"
+ "fmax v21.8h, v21.8h, v1.8h\n"
+ "fmax v22.8h, v22.8h, v1.8h\n"
+ "fmax v23.8h, v23.8h, v1.8h\n"
+ "fmin v24.8h, v24.8h, v0.8h\n"
+ "fmin v25.8h, v25.8h, v0.8h\n"
+ "fmin v26.8h, v26.8h, v0.8h\n"
+ "fmax v24.8h, v24.8h, v1.8h\n"
+ "fmax v25.8h, v25.8h, v1.8h\n"
+ "fmax v26.8h, v26.8h, v1.8h\n"
+ "fmin v27.8h, v27.8h, v0.8h\n"
+ "fmin v28.8h, v28.8h, v0.8h\n"
+ "fmin v29.8h, v29.8h, v0.8h\n"
+ "fmax v27.8h, v27.8h, v1.8h\n"
+ "fmax v28.8h, v28.8h, v1.8h\n"
+ "fmax v29.8h, v29.8h, v1.8h\n"
+ "fmin v30.8h, v30.8h, v0.8h\n"
+ "fmin v31.8h, v31.8h, v0.8h\n"
+ "fmax v30.8h, v30.8h, v1.8h\n"
+ "fmax v31.8h, v31.8h, v1.8h\n"
+ "282:" // Height 6: No activation
+ "cmp x16, #0x20\n"
+ "bge 299f\n"
+ "tbz x16, #4, 290f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v9.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x9], #0x10\n"
+ "st1 { v13.8h }, [x9], #0x10\n"
+ "st1 { v16.8h }, [x27], #0x10\n"
+ "st1 { v17.8h }, [x27], #0x10\n"
+ "st1 { v20.8h }, [x25], #0x10\n"
+ "st1 { v21.8h }, [x25], #0x10\n"
+ "st1 { v24.8h }, [x23], #0x10\n"
+ "st1 { v25.8h }, [x23], #0x10\n"
+ "st1 { v28.8h }, [x21], #0x10\n"
+ "st1 { v29.8h }, [x21], #0x10\n"
+ "tbz x16, #3, 286f\n"
+ "st1 { v10.8h }, [x13], #0x10\n"
+ "st1 { v14.8h }, [x9], #0x10\n"
+ "st1 { v18.8h }, [x27], #0x10\n"
+ "st1 { v22.8h }, [x25], #0x10\n"
+ "st1 { v26.8h }, [x23], #0x10\n"
+ "st1 { v30.8h }, [x21], #0x10\n"
+ "tbz x16, #2, 284f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "str d27, [x23], #0x8\n"
+ "str d31, [x21], #0x8\n"
+ "tbz x16, #1, 283f\n"
+ "st1 { v11.s }[2], [x13], #0x4\n"
+ "st1 { v15.s }[2], [x9], #0x4\n"
+ "st1 { v19.s }[2], [x27], #0x4\n"
+ "st1 { v23.s }[2], [x25], #0x4\n"
+ "st1 { v27.s }[2], [x23], #0x4\n"
+ "st1 { v31.s }[2], [x21], #0x4\n"
+ "tbz x16, #0, 298f\n"
+ "st1 { v11.h }[6], [x13]\n"
+ "st1 { v15.h }[6], [x9]\n"
+ "st1 { v19.h }[6], [x27]\n"
+ "st1 { v23.h }[6], [x25]\n"
+ "st1 { v27.h }[6], [x23]\n"
+ "st1 { v31.h }[6], [x21]\n"
+ "b 298f\n"
+ "283:" // Height 6: Partial direct writeback: partial_1_28
+ "tbz x16, #0, 298f\n"
+ "st1 { v11.h }[4], [x13]\n"
+ "st1 { v15.h }[4], [x9]\n"
+ "st1 { v19.h }[4], [x27]\n"
+ "st1 { v23.h }[4], [x25]\n"
+ "st1 { v27.h }[4], [x23]\n"
+ "st1 { v31.h }[4], [x21]\n"
+ "b 298f\n"
+ "284:" // Height 6: Partial direct writeback: partial_2_24
+ "tbz x16, #1, 285f\n"
+ "str s11, [x13], #0x4\n"
+ "str s15, [x9], #0x4\n"
+ "str s19, [x27], #0x4\n"
+ "str s23, [x25], #0x4\n"
+ "str s27, [x23], #0x4\n"
+ "str s31, [x21], #0x4\n"
+ "tbz x16, #0, 298f\n"
+ "st1 { v11.h }[2], [x13]\n"
+ "st1 { v15.h }[2], [x9]\n"
+ "st1 { v19.h }[2], [x27]\n"
+ "st1 { v23.h }[2], [x25]\n"
+ "st1 { v27.h }[2], [x23]\n"
+ "st1 { v31.h }[2], [x21]\n"
+ "b 298f\n"
+ "285:" // Height 6: Partial direct writeback: partial_1_24
+ "tbz x16, #0, 298f\n"
+ "str h11, [x13, #0x0]\n"
+ "str h15, [x9, #0x0]\n"
+ "str h19, [x27, #0x0]\n"
+ "str h23, [x25, #0x0]\n"
+ "str h27, [x23, #0x0]\n"
+ "str h31, [x21, #0x0]\n"
+ "b 298f\n"
+ "286:" // Height 6: Partial direct writeback: partial_4_16
+ "tbz x16, #2, 288f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "str d26, [x23], #0x8\n"
+ "str d30, [x21], #0x8\n"
+ "tbz x16, #1, 287f\n"
+ "st1 { v10.s }[2], [x13], #0x4\n"
+ "st1 { v14.s }[2], [x9], #0x4\n"
+ "st1 { v18.s }[2], [x27], #0x4\n"
+ "st1 { v22.s }[2], [x25], #0x4\n"
+ "st1 { v26.s }[2], [x23], #0x4\n"
+ "st1 { v30.s }[2], [x21], #0x4\n"
+ "tbz x16, #0, 298f\n"
+ "st1 { v10.h }[6], [x13]\n"
+ "st1 { v14.h }[6], [x9]\n"
+ "st1 { v18.h }[6], [x27]\n"
+ "st1 { v22.h }[6], [x25]\n"
+ "st1 { v26.h }[6], [x23]\n"
+ "st1 { v30.h }[6], [x21]\n"
+ "b 298f\n"
+ "287:" // Height 6: Partial direct writeback: partial_1_20
+ "tbz x16, #0, 298f\n"
+ "st1 { v10.h }[4], [x13]\n"
+ "st1 { v14.h }[4], [x9]\n"
+ "st1 { v18.h }[4], [x27]\n"
+ "st1 { v22.h }[4], [x25]\n"
+ "st1 { v26.h }[4], [x23]\n"
+ "st1 { v30.h }[4], [x21]\n"
+ "b 298f\n"
+ "288:" // Height 6: Partial direct writeback: partial_2_16
+ "tbz x16, #1, 289f\n"
+ "str s10, [x13], #0x4\n"
+ "str s14, [x9], #0x4\n"
+ "str s18, [x27], #0x4\n"
+ "str s22, [x25], #0x4\n"
+ "str s26, [x23], #0x4\n"
+ "str s30, [x21], #0x4\n"
+ "tbz x16, #0, 298f\n"
+ "st1 { v10.h }[2], [x13]\n"
+ "st1 { v14.h }[2], [x9]\n"
+ "st1 { v18.h }[2], [x27]\n"
+ "st1 { v22.h }[2], [x25]\n"
+ "st1 { v26.h }[2], [x23]\n"
+ "st1 { v30.h }[2], [x21]\n"
+ "b 298f\n"
+ "289:" // Height 6: Partial direct writeback: partial_1_16
+ "tbz x16, #0, 298f\n"
+ "str h10, [x13, #0x0]\n"
+ "str h14, [x9, #0x0]\n"
+ "str h18, [x27, #0x0]\n"
+ "str h22, [x25, #0x0]\n"
+ "str h26, [x23, #0x0]\n"
+ "str h30, [x21, #0x0]\n"
+ "b 298f\n"
+ "290:" // Height 6: Partial direct writeback: partial_8_0
+ "tbz x16, #3, 294f\n"
+ "st1 { v8.8h }, [x13], #0x10\n"
+ "st1 { v12.8h }, [x9], #0x10\n"
+ "st1 { v16.8h }, [x27], #0x10\n"
+ "st1 { v20.8h }, [x25], #0x10\n"
+ "st1 { v24.8h }, [x23], #0x10\n"
+ "st1 { v28.8h }, [x21], #0x10\n"
+ "tbz x16, #2, 292f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "str d25, [x23], #0x8\n"
+ "str d29, [x21], #0x8\n"
+ "tbz x16, #1, 291f\n"
+ "st1 { v9.s }[2], [x13], #0x4\n"
+ "st1 { v13.s }[2], [x9], #0x4\n"
+ "st1 { v17.s }[2], [x27], #0x4\n"
+ "st1 { v21.s }[2], [x25], #0x4\n"
+ "st1 { v25.s }[2], [x23], #0x4\n"
+ "st1 { v29.s }[2], [x21], #0x4\n"
+ "tbz x16, #0, 298f\n"
+ "st1 { v9.h }[6], [x13]\n"
+ "st1 { v13.h }[6], [x9]\n"
+ "st1 { v17.h }[6], [x27]\n"
+ "st1 { v21.h }[6], [x25]\n"
+ "st1 { v25.h }[6], [x23]\n"
+ "st1 { v29.h }[6], [x21]\n"
+ "b 298f\n"
+ "291:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 298f\n"
+ "st1 { v9.h }[4], [x13]\n"
+ "st1 { v13.h }[4], [x9]\n"
+ "st1 { v17.h }[4], [x27]\n"
+ "st1 { v21.h }[4], [x25]\n"
+ "st1 { v25.h }[4], [x23]\n"
+ "st1 { v29.h }[4], [x21]\n"
+ "b 298f\n"
+ "292:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 293f\n"
+ "str s9, [x13], #0x4\n"
+ "str s13, [x9], #0x4\n"
+ "str s17, [x27], #0x4\n"
+ "str s21, [x25], #0x4\n"
+ "str s25, [x23], #0x4\n"
+ "str s29, [x21], #0x4\n"
+ "tbz x16, #0, 298f\n"
+ "st1 { v9.h }[2], [x13]\n"
+ "st1 { v13.h }[2], [x9]\n"
+ "st1 { v17.h }[2], [x27]\n"
+ "st1 { v21.h }[2], [x25]\n"
+ "st1 { v25.h }[2], [x23]\n"
+ "st1 { v29.h }[2], [x21]\n"
+ "b 298f\n"
+ "293:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 298f\n"
+ "str h9, [x13, #0x0]\n"
+ "str h13, [x9, #0x0]\n"
+ "str h17, [x27, #0x0]\n"
+ "str h21, [x25, #0x0]\n"
+ "str h25, [x23, #0x0]\n"
+ "str h29, [x21, #0x0]\n"
+ "b 298f\n"
+ "294:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 296f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x16, #1, 295f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x9], #0x4\n"
+ "st1 { v16.s }[2], [x27], #0x4\n"
+ "st1 { v20.s }[2], [x25], #0x4\n"
+ "st1 { v24.s }[2], [x23], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
+ "tbz x16, #0, 298f\n"
+ "st1 { v8.h }[6], [x13]\n"
+ "st1 { v12.h }[6], [x9]\n"
+ "st1 { v16.h }[6], [x27]\n"
+ "st1 { v20.h }[6], [x25]\n"
+ "st1 { v24.h }[6], [x23]\n"
+ "st1 { v28.h }[6], [x21]\n"
+ "b 298f\n"
+ "295:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 298f\n"
+ "st1 { v8.h }[4], [x13]\n"
+ "st1 { v12.h }[4], [x9]\n"
+ "st1 { v16.h }[4], [x27]\n"
+ "st1 { v20.h }[4], [x25]\n"
+ "st1 { v24.h }[4], [x23]\n"
+ "st1 { v28.h }[4], [x21]\n"
+ "b 298f\n"
+ "296:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 297f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x9], #0x4\n"
+ "str s16, [x27], #0x4\n"
+ "str s20, [x25], #0x4\n"
+ "str s24, [x23], #0x4\n"
+ "str s28, [x21], #0x4\n"
+ "tbz x16, #0, 298f\n"
+ "st1 { v8.h }[2], [x13]\n"
+ "st1 { v12.h }[2], [x9]\n"
+ "st1 { v16.h }[2], [x27]\n"
+ "st1 { v20.h }[2], [x25]\n"
+ "st1 { v24.h }[2], [x23]\n"
+ "st1 { v28.h }[2], [x21]\n"
+ "b 298f\n"
+ "297:" // Height 6: Partial direct writeback: partial_1_0
+ "str h8, [x13, #0x0]\n"
+ "str h12, [x9, #0x0]\n"
+ "str h16, [x27, #0x0]\n"
+ "str h20, [x25, #0x0]\n"
+ "str h24, [x23, #0x0]\n"
+ "str h28, [x21, #0x0]\n"
+ "298:" // Height 6: Partial direct writeback: Done
+ "b 300f\n"
+ "299:" // Height 6: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q25, [x23, #0x10]\n"
+ "str q26, [x23, #0x20]\n"
+ "str q27, [x23, #0x30]\n"
+ "str q28, [x21, #0x0]\n"
+ "str q29, [x21, #0x10]\n"
+ "str q30, [x21, #0x20]\n"
+ "str q31, [x21, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "add x23, x23, #0x40\n"
+ "add x21, x21, #0x40\n"
+ "300:" // Height 6: Writeback done
+ "subs x16, x16, #0x20\n"
+ "bgt 253b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 302f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 301f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "301:" // Update direct input
+ "mov x19, #0xc\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "302:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp
deleted file mode 100644
index 94fcd1064e..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/a55.cpp
+++ /dev/null
@@ -1,2427 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_fp32_mla_16x4_a55(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
- const int K_stride = K;
- const long loops_count = ((K + 4) / 8) - 1;
- K -= loops_count * 8;
- const long regs_count = (K / 4) - 1;
- K -= (regs_count + 1) * 4;
- const long blocks_count = K / 1;
- float nullbias[16];
- if (!accumulate && !bias) {
- memset(nullbias, 0, (16 * sizeof(float)));
- }
- float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
- float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
- const float * const minptr = &minval;
- const float * const maxptr = &maxval;
-
- switch(act.type)
- {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- maxval = static_cast<float>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- minval = 0.0f;
- break;
- }
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const float * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(float);
-
- float *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=16ul) {
- const long width = std::min((unsigned long)N-x0, 16ul);
- long loops = loops_count;
- long regs = regs_count;
- long blocks = blocks_count;
- const float *a_ptr0 = a_ptr0_base;
- const float *b_ptr0 = B + (K_stride * x0);
- const bool use_result_buffer = (width < 16);
- float result_buffer[64];
- const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(float);
- float *c_ptr_real = c_ptr0;
- if (use_result_buffer && accumulate) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
- }
- }
- }
- if (use_result_buffer) {
- c_ptr0 = result_buffer;
- }
- const float *biasptr = bias ? bias+x0 : nullbias;
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "temploadreg0 .req X0\n"
- "temploadreg1 .req X1\n"
- "temploadreg2 .req X2\n"
- "temploadreg3 .req X3\n"
- "cbnz %[accumulate], 1f\n"
- "ldr q16, [%[biasptr]]\n"
- "ldr q17, [%[biasptr], #0x10]\n"
- "ldr q18, [%[biasptr], #0x20]\n"
- "ldr q19, [%[biasptr], #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ins v4.d[1], temploadreg0\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "ins v0.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "b.ne 3b\n"
- "2:\n"
- "ins v14.d[1], temploadreg2\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ins v4.d[1], temploadreg0\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "b 5f\n"
- "4:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "ld1r {v14.4s}, [%[minptr]]\n"
- "ld1r {v15.4s}, [%[maxptr]]\n"
- "fmax v16.4s, v16.4s, v14.4s\n"
- "fmax v17.4s, v17.4s, v14.4s\n"
- "fmax v18.4s, v18.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v14.4s\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
- "fmin v17.4s, v17.4s, v15.4s\n"
- "fmin v18.4s, v18.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v15.4s\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "temploadreg0 .req X2\n"
- "temploadreg1 .req X3\n"
- "temploadreg2 .req X4\n"
- "temploadreg3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "ldr q16, [%[biasptr]]\n"
- "ldr q17, [%[biasptr], #0x10]\n"
- "ldr q18, [%[biasptr], #0x20]\n"
- "ldr q19, [%[biasptr], #0x30]\n"
- "mov v20.16b, v16.16b\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mov v21.16b, v17.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v22.16b, v18.16b\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mov v23.16b, v19.16b\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ldr d5, [a_ptr1]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ins v4.d[1], temploadreg0\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "ins v5.d[1], temploadreg1\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "ins v15.d[1], temploadreg3\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- "fmla v20.4s, v12.4s, v5.s[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v0.d[1], temploadreg0\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v21.4s, v13.4s, v5.s[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v1.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v22.4s, v14.4s, v5.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v23.4s, v15.4s, v5.s[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "fmla v20.4s, v12.4s, v5.s[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "fmla v21.4s, v13.4s, v5.s[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "fmla v22.4s, v14.4s, v5.s[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "fmla v23.4s, v15.4s, v5.s[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "b.ne 3b\n"
- "2:\n"
- "ins v14.d[1], temploadreg2\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr d5, [a_ptr1]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ins v4.d[1], temploadreg0\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ins v5.d[1], temploadreg1\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v20.4s, v12.4s, v5.s[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v21.4s, v13.4s, v5.s[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v22.4s, v14.4s, v5.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v23.4s, v15.4s, v5.s[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "fmla v20.4s, v12.4s, v5.s[3]\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "fmla v21.4s, v13.4s, v5.s[3]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "fmla v22.4s, v14.4s, v5.s[3]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "fmla v23.4s, v15.4s, v5.s[3]\n"
- "b 5f\n"
- "4:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr s1, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "ld1r {v14.4s}, [%[minptr]]\n"
- "ld1r {v15.4s}, [%[maxptr]]\n"
- "fmax v16.4s, v16.4s, v14.4s\n"
- "fmax v17.4s, v17.4s, v14.4s\n"
- "fmax v18.4s, v18.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v14.4s\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
- "fmin v17.4s, v17.4s, v15.4s\n"
- "fmin v18.4s, v18.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v15.4s\n"
- "str q16, [%[c_ptr0]]\n"
- "fmax v20.4s, v20.4s, v14.4s\n"
- "fmax v21.4s, v21.4s, v14.4s\n"
- "fmax v22.4s, v22.4s, v14.4s\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "fmax v23.4s, v23.4s, v14.4s\n"
- "fmin v20.4s, v20.4s, v15.4s\n"
- "fmin v21.4s, v21.4s, v15.4s\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "fmin v22.4s, v22.4s, v15.4s\n"
- "fmin v23.4s, v23.4s, v15.4s\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "temploadreg0 .req X4\n"
- "temploadreg1 .req X5\n"
- "temploadreg2 .req X6\n"
- "temploadreg3 .req X7\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "ldr q16, [%[biasptr]]\n"
- "ldr q17, [%[biasptr], #0x10]\n"
- "ldr q18, [%[biasptr], #0x20]\n"
- "ldr q19, [%[biasptr], #0x30]\n"
- "mov v20.16b, v16.16b\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mov v21.16b, v17.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v22.16b, v18.16b\n"
- "ldr q2, [a_ptr2]\n"
- "mov v23.16b, v19.16b\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mov v24.16b, v16.16b\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "mov v25.16b, v17.16b\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "mov v26.16b, v18.16b\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "mov v27.16b, v19.16b\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "ldr d5, [a_ptr1]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ldr d6, [a_ptr2]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ins v4.d[1], temploadreg0\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "ins v5.d[1], temploadreg1\n"
- "fmla v24.4s, v12.4s, v2.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "ins v6.d[1], temploadreg2\n"
- "fmla v25.4s, v13.4s, v2.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v26.4s, v14.4s, v2.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v27.4s, v15.4s, v2.s[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- "fmla v24.4s, v12.4s, v2.s[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "fmla v25.4s, v13.4s, v2.s[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "fmla v26.4s, v14.4s, v2.s[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "fmla v27.4s, v15.4s, v2.s[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "fmla v24.4s, v8.4s, v6.s[0]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "fmla v25.4s, v9.4s, v6.s[0]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- "fmla v26.4s, v10.4s, v6.s[0]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v27.4s, v11.4s, v6.s[0]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v20.4s, v12.4s, v5.s[1]\n"
- "ins v0.d[1], temploadreg0\n"
- "fmla v24.4s, v12.4s, v6.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v21.4s, v13.4s, v5.s[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v25.4s, v13.4s, v6.s[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v22.4s, v14.4s, v5.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v26.4s, v14.4s, v6.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- "fmla v23.4s, v15.4s, v5.s[1]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- "fmla v27.4s, v15.4s, v6.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "ins v1.d[1], temploadreg1\n"
- "fmla v24.4s, v8.4s, v6.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "ldr d2, [a_ptr2, #-0x10]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- "ldr temploadreg2, [a_ptr2, #-0x8]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v2.d[1], temploadreg2\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v25.4s, v9.4s, v6.s[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v27.4s, v11.4s, v6.s[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v26.4s, v10.4s, v6.s[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "fmla v20.4s, v12.4s, v5.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- "fmla v24.4s, v12.4s, v6.s[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "fmla v21.4s, v13.4s, v5.s[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "fmla v25.4s, v13.4s, v6.s[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "fmla v22.4s, v14.4s, v5.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "fmla v26.4s, v14.4s, v6.s[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v23.4s, v15.4s, v5.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "fmla v27.4s, v15.4s, v6.s[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "b.ne 3b\n"
- "2:\n"
- "ins v14.d[1], temploadreg2\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr d5, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr d6, [a_ptr2]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ins v4.d[1], temploadreg0\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ins v5.d[1], temploadreg1\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "ins v6.d[1], temploadreg2\n"
- "fmla v24.4s, v12.4s, v2.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v25.4s, v13.4s, v2.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v26.4s, v14.4s, v2.s[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "fmla v27.4s, v15.4s, v2.s[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "fmla v24.4s, v12.4s, v2.s[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "fmla v25.4s, v13.4s, v2.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v26.4s, v14.4s, v2.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v27.4s, v15.4s, v2.s[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v24.4s, v8.4s, v6.s[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v25.4s, v9.4s, v6.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v26.4s, v10.4s, v6.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v27.4s, v11.4s, v6.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v20.4s, v12.4s, v5.s[1]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v24.4s, v12.4s, v6.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v21.4s, v13.4s, v5.s[1]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v25.4s, v13.4s, v6.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "fmla v22.4s, v14.4s, v5.s[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v26.4s, v14.4s, v6.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "fmla v23.4s, v15.4s, v5.s[1]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v27.4s, v15.4s, v6.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v24.4s, v8.4s, v6.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v25.4s, v9.4s, v6.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v26.4s, v10.4s, v6.s[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v27.4s, v11.4s, v6.s[2]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v20.4s, v12.4s, v5.s[3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "fmla v24.4s, v12.4s, v6.s[3]\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "fmla v21.4s, v13.4s, v5.s[3]\n"
- "fmla v25.4s, v13.4s, v6.s[3]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "fmla v22.4s, v14.4s, v5.s[3]\n"
- "fmla v26.4s, v14.4s, v6.s[3]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "fmla v23.4s, v15.4s, v5.s[3]\n"
- "fmla v27.4s, v15.4s, v6.s[3]\n"
- "b 5f\n"
- "4:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v24.4s, v12.4s, v2.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v25.4s, v13.4s, v2.s[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "fmla v26.4s, v14.4s, v2.s[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "fmla v27.4s, v15.4s, v2.s[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "fmla v24.4s, v12.4s, v2.s[3]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "fmla v25.4s, v13.4s, v2.s[3]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "fmla v26.4s, v14.4s, v2.s[3]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "fmla v27.4s, v15.4s, v2.s[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr s1, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr s2, [a_ptr2]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "ld1r {v14.4s}, [%[minptr]]\n"
- "ld1r {v15.4s}, [%[maxptr]]\n"
- "fmax v16.4s, v16.4s, v14.4s\n"
- "fmax v17.4s, v17.4s, v14.4s\n"
- "fmax v18.4s, v18.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v14.4s\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
- "fmin v17.4s, v17.4s, v15.4s\n"
- "fmin v18.4s, v18.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v15.4s\n"
- "str q16, [%[c_ptr0]]\n"
- "fmax v20.4s, v20.4s, v14.4s\n"
- "fmax v21.4s, v21.4s, v14.4s\n"
- "fmax v22.4s, v22.4s, v14.4s\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "fmax v23.4s, v23.4s, v14.4s\n"
- "fmin v20.4s, v20.4s, v15.4s\n"
- "fmin v21.4s, v21.4s, v15.4s\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "fmin v22.4s, v22.4s, v15.4s\n"
- "fmin v23.4s, v23.4s, v15.4s\n"
- "fmax v24.4s, v24.4s, v14.4s\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "fmax v25.4s, v25.4s, v14.4s\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "fmax v26.4s, v26.4s, v14.4s\n"
- "str q20, [c_ptr1]\n"
- "fmin v24.4s, v24.4s, v15.4s\n"
- "fmin v25.4s, v25.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v14.4s\n"
- "str q21, [c_ptr1, #0x10]\n"
- "fmin v26.4s, v26.4s, v15.4s\n"
- "fmin v27.4s, v27.4s, v15.4s\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "temploadreg0 .req X6\n"
- "temploadreg1 .req X7\n"
- "temploadreg2 .req X8\n"
- "temploadreg3 .req X9\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "ldr q16, [%[biasptr]]\n"
- "ldr q17, [%[biasptr], #0x10]\n"
- "ldr q18, [%[biasptr], #0x20]\n"
- "ldr q19, [%[biasptr], #0x30]\n"
- "mov v20.16b, v16.16b\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mov v21.16b, v17.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v22.16b, v18.16b\n"
- "ldr q2, [a_ptr2]\n"
- "mov v23.16b, v19.16b\n"
- "ldr q3, [a_ptr3]\n"
- "mov v24.16b, v16.16b\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mov v25.16b, v17.16b\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "mov v26.16b, v18.16b\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "mov v27.16b, v19.16b\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "mov v28.16b, v16.16b\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "mov v29.16b, v17.16b\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "mov v30.16b, v18.16b\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "mov v31.16b, v19.16b\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ins v14.d[1], temploadreg2\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q28, [c_ptr3]\n"
- "ldr q29, [c_ptr3, #0x10]\n"
- "ldr q30, [c_ptr3, #0x20]\n"
- "ldr q31, [c_ptr3, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q3, [a_ptr3]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v14.d[1], temploadreg2\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- "fmla v28.4s, v8.4s, v3.s[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr d5, [a_ptr1]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "ldr d6, [a_ptr2]\n"
- "fmla v29.4s, v9.4s, v3.s[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr d7, [a_ptr3]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x8]\n"
- "fmla v30.4s, v10.4s, v3.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ins v4.d[1], temploadreg0\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v31.4s, v11.4s, v3.s[0]\n"
- "ins v5.d[1], temploadreg1\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v12.4s, v2.s[1]\n"
- "ins v6.d[1], temploadreg2\n"
- "fmla v28.4s, v12.4s, v3.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "ins v7.d[1], temploadreg3\n"
- "fmla v25.4s, v13.4s, v2.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v29.4s, v13.4s, v3.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v26.4s, v14.4s, v2.s[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "fmla v30.4s, v14.4s, v3.s[1]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "fmla v27.4s, v15.4s, v2.s[1]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v31.4s, v15.4s, v3.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v28.4s, v8.4s, v3.s[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v29.4s, v9.4s, v3.s[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "fmla v30.4s, v10.4s, v3.s[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "fmla v31.4s, v11.4s, v3.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "fmla v24.4s, v12.4s, v2.s[3]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v28.4s, v12.4s, v3.s[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v25.4s, v13.4s, v2.s[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "fmla v29.4s, v13.4s, v3.s[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "fmla v26.4s, v14.4s, v2.s[3]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v30.4s, v14.4s, v3.s[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v27.4s, v15.4s, v2.s[3]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v31.4s, v15.4s, v3.s[3]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v24.4s, v8.4s, v6.s[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- "fmla v28.4s, v8.4s, v7.s[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v25.4s, v9.4s, v6.s[0]\n"
- "ins v0.d[1], temploadreg0\n"
- "fmla v29.4s, v9.4s, v7.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v26.4s, v10.4s, v6.s[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- "fmla v30.4s, v10.4s, v7.s[0]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v27.4s, v11.4s, v6.s[0]\n"
- "ins v1.d[1], temploadreg1\n"
- "fmla v31.4s, v11.4s, v7.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v20.4s, v12.4s, v5.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v24.4s, v12.4s, v6.s[1]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v28.4s, v12.4s, v7.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "fmla v21.4s, v13.4s, v5.s[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla v25.4s, v13.4s, v6.s[1]\n"
- "ldr d2, [a_ptr2, #-0x10]\n"
- "fmla v29.4s, v13.4s, v7.s[1]\n"
- "ldr temploadreg2, [a_ptr2, #-0x8]\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "fmla v22.4s, v14.4s, v5.s[1]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v26.4s, v14.4s, v6.s[1]\n"
- "ins v2.d[1], temploadreg2\n"
- "fmla v30.4s, v14.4s, v7.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "fmla v23.4s, v15.4s, v5.s[1]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v27.4s, v15.4s, v6.s[1]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v31.4s, v15.4s, v7.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- "fmla v24.4s, v8.4s, v6.s[2]\n"
- "ldr d3, [a_ptr3, #-0x10]\n"
- "fmla v28.4s, v8.4s, v7.s[2]\n"
- "ldr temploadreg3, [a_ptr3, #-0x8]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v25.4s, v9.4s, v6.s[2]\n"
- "ins v3.d[1], temploadreg3\n"
- "fmla v29.4s, v9.4s, v7.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v26.4s, v10.4s, v6.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v30.4s, v10.4s, v7.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- "fmla v27.4s, v11.4s, v6.s[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "fmla v31.4s, v11.4s, v7.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "fmla v20.4s, v12.4s, v5.s[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "fmla v24.4s, v12.4s, v6.s[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "fmla v28.4s, v12.4s, v7.s[3]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "fmla v21.4s, v13.4s, v5.s[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "fmla v25.4s, v13.4s, v6.s[3]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v29.4s, v13.4s, v7.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "fmla v22.4s, v14.4s, v5.s[3]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v26.4s, v14.4s, v6.s[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "fmla v30.4s, v14.4s, v7.s[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v23.4s, v15.4s, v5.s[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "fmla v27.4s, v15.4s, v6.s[3]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v31.4s, v15.4s, v7.s[3]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- "ins v14.d[1], temploadreg2\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr d5, [a_ptr1]\n"
- "fmla v28.4s, v8.4s, v3.s[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr d6, [a_ptr2]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "ldr d7, [a_ptr3]\n"
- "fmla v29.4s, v9.4s, v3.s[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x8]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ins v4.d[1], temploadreg0\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v30.4s, v10.4s, v3.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ins v5.d[1], temploadreg1\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v31.4s, v11.4s, v3.s[0]\n"
- "ins v6.d[1], temploadreg2\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v12.4s, v2.s[1]\n"
- "ins v7.d[1], temploadreg3\n"
- "fmla v28.4s, v12.4s, v3.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v25.4s, v13.4s, v2.s[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v29.4s, v13.4s, v3.s[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "fmla v26.4s, v14.4s, v2.s[1]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v30.4s, v14.4s, v3.s[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v27.4s, v15.4s, v2.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v31.4s, v15.4s, v3.s[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v28.4s, v8.4s, v3.s[2]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- "fmla v29.4s, v9.4s, v3.s[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "fmla v30.4s, v10.4s, v3.s[2]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v31.4s, v11.4s, v3.s[2]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v24.4s, v12.4s, v2.s[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "fmla v28.4s, v12.4s, v3.s[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "fmla v25.4s, v13.4s, v2.s[3]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v29.4s, v13.4s, v3.s[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v26.4s, v14.4s, v2.s[3]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v30.4s, v14.4s, v3.s[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v27.4s, v15.4s, v2.s[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v31.4s, v15.4s, v3.s[3]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "fmla v24.4s, v8.4s, v6.s[0]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v28.4s, v8.4s, v7.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "fmla v25.4s, v9.4s, v6.s[0]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v29.4s, v9.4s, v7.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "fmla v26.4s, v10.4s, v6.s[0]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v30.4s, v10.4s, v7.s[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "fmla v27.4s, v11.4s, v6.s[0]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v31.4s, v11.4s, v7.s[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v20.4s, v12.4s, v5.s[1]\n"
- "fmla v24.4s, v12.4s, v6.s[1]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v28.4s, v12.4s, v7.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v21.4s, v13.4s, v5.s[1]\n"
- "fmla v25.4s, v13.4s, v6.s[1]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v29.4s, v13.4s, v7.s[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "fmla v22.4s, v14.4s, v5.s[1]\n"
- "fmla v26.4s, v14.4s, v6.s[1]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v30.4s, v14.4s, v7.s[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "fmla v23.4s, v15.4s, v5.s[1]\n"
- "fmla v27.4s, v15.4s, v6.s[1]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v31.4s, v15.4s, v7.s[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v24.4s, v8.4s, v6.s[2]\n"
- "fmla v28.4s, v8.4s, v7.s[2]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "fmla v25.4s, v9.4s, v6.s[2]\n"
- "fmla v29.4s, v9.4s, v7.s[2]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "fmla v26.4s, v10.4s, v6.s[2]\n"
- "fmla v30.4s, v10.4s, v7.s[2]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "fmla v27.4s, v11.4s, v6.s[2]\n"
- "fmla v31.4s, v11.4s, v7.s[2]\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "fmla v20.4s, v12.4s, v5.s[3]\n"
- "fmla v24.4s, v12.4s, v6.s[3]\n"
- "fmla v28.4s, v12.4s, v7.s[3]\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "fmla v21.4s, v13.4s, v5.s[3]\n"
- "fmla v25.4s, v13.4s, v6.s[3]\n"
- "fmla v29.4s, v13.4s, v7.s[3]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "fmla v22.4s, v14.4s, v5.s[3]\n"
- "fmla v26.4s, v14.4s, v6.s[3]\n"
- "fmla v30.4s, v14.4s, v7.s[3]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "fmla v23.4s, v15.4s, v5.s[3]\n"
- "fmla v27.4s, v15.4s, v6.s[3]\n"
- "fmla v31.4s, v15.4s, v7.s[3]\n"
- "b 5f\n"
- "4:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "fmla v28.4s, v8.4s, v3.s[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "ins v8.d[1], temploadreg0\n"
- "fmla v29.4s, v9.4s, v3.s[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "ins v9.d[1], temploadreg1\n"
- "fmla v30.4s, v10.4s, v3.s[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "ins v10.d[1], temploadreg2\n"
- "fmla v31.4s, v11.4s, v3.s[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "fmla v24.4s, v12.4s, v2.s[1]\n"
- "ins v11.d[1], temploadreg3\n"
- "fmla v28.4s, v12.4s, v3.s[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "fmla v25.4s, v13.4s, v2.s[1]\n"
- "ins v12.d[1], temploadreg0\n"
- "fmla v29.4s, v13.4s, v3.s[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "fmla v26.4s, v14.4s, v2.s[1]\n"
- "ins v13.d[1], temploadreg1\n"
- "fmla v30.4s, v14.4s, v3.s[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "fmla v27.4s, v15.4s, v2.s[1]\n"
- "ins v14.d[1], temploadreg2\n"
- "fmla v31.4s, v15.4s, v3.s[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ins v15.d[1], temploadreg3\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "fmla v28.4s, v8.4s, v3.s[2]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "fmla v29.4s, v9.4s, v3.s[2]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "fmla v30.4s, v10.4s, v3.s[2]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "fmla v31.4s, v11.4s, v3.s[2]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "fmla v24.4s, v12.4s, v2.s[3]\n"
- "fmla v28.4s, v12.4s, v3.s[3]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "fmla v25.4s, v13.4s, v2.s[3]\n"
- "fmla v29.4s, v13.4s, v3.s[3]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "fmla v26.4s, v14.4s, v2.s[3]\n"
- "fmla v30.4s, v14.4s, v3.s[3]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "fmla v27.4s, v15.4s, v2.s[3]\n"
- "fmla v31.4s, v15.4s, v3.s[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr s1, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr s2, [a_ptr2]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr s3, [a_ptr3]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "add a_ptr3, a_ptr3, #0x4\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "fmla v28.4s, v8.4s, v3.s[0]\n"
- "fmla v29.4s, v9.4s, v3.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "fmla v30.4s, v10.4s, v3.s[0]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "fmla v31.4s, v11.4s, v3.s[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "ld1r {v14.4s}, [%[minptr]]\n"
- "ld1r {v15.4s}, [%[maxptr]]\n"
- "fmax v16.4s, v16.4s, v14.4s\n"
- "fmax v17.4s, v17.4s, v14.4s\n"
- "fmax v18.4s, v18.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v14.4s\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
- "fmin v17.4s, v17.4s, v15.4s\n"
- "fmin v18.4s, v18.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v15.4s\n"
- "str q16, [%[c_ptr0]]\n"
- "fmax v20.4s, v20.4s, v14.4s\n"
- "fmax v21.4s, v21.4s, v14.4s\n"
- "fmax v22.4s, v22.4s, v14.4s\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "fmax v23.4s, v23.4s, v14.4s\n"
- "fmin v20.4s, v20.4s, v15.4s\n"
- "fmin v21.4s, v21.4s, v15.4s\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "fmin v22.4s, v22.4s, v15.4s\n"
- "fmin v23.4s, v23.4s, v15.4s\n"
- "fmax v24.4s, v24.4s, v14.4s\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "fmax v25.4s, v25.4s, v14.4s\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "fmax v26.4s, v26.4s, v14.4s\n"
- "str q20, [c_ptr1]\n"
- "fmin v24.4s, v24.4s, v15.4s\n"
- "fmin v25.4s, v25.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v14.4s\n"
- "str q21, [c_ptr1, #0x10]\n"
- "fmin v26.4s, v26.4s, v15.4s\n"
- "fmax v28.4s, v28.4s, v14.4s\n"
- "fmax v29.4s, v29.4s, v14.4s\n"
- "str q22, [c_ptr1, #0x20]\n"
- "fmin v27.4s, v27.4s, v15.4s\n"
- "fmax v30.4s, v30.4s, v14.4s\n"
- "fmin v28.4s, v28.4s, v15.4s\n"
- "str q23, [c_ptr1, #0x30]\n"
- "fmin v29.4s, v29.4s, v15.4s\n"
- "fmax v31.4s, v31.4s, v14.4s\n"
- "fmin v30.4s, v30.4s, v15.4s\n"
- "str q24, [c_ptr2]\n"
- "fmin v31.4s, v31.4s, v15.4s\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- "str q28, [c_ptr3]\n"
- "str q29, [c_ptr3, #0x10]\n"
- "str q30, [c_ptr3, #0x20]\n"
- "str q31, [c_ptr3, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
- );
- break;
- }
- if (use_result_buffer) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
- }
- }
- }
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp
deleted file mode 100644
index 016bef4b9d..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/generic.cpp
+++ /dev/null
@@ -1,1802 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_fp32_mla_16x4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
- const int K_stride = K;
- const long loops_count = ((K + 4) / 8) - 1;
- K -= loops_count * 8;
- const long regs_count = (K / 4) - 1;
- K -= (regs_count + 1) * 4;
- const long blocks_count = K / 1;
- float nullbias[16];
- if (!accumulate && !bias) {
- memset(nullbias, 0, (16 * sizeof(float)));
- }
- float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
- float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
- const float * const minptr = &minval;
- const float * const maxptr = &maxval;
-
- switch(act.type)
- {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- maxval = static_cast<float>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- minval = 0.0f;
- break;
- }
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const float * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(float);
-
- float *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=16ul) {
- const long width = std::min((unsigned long)N-x0, 16ul);
- long loops = loops_count;
- long regs = regs_count;
- long blocks = blocks_count;
- const float *a_ptr0 = a_ptr0_base;
- const float *b_ptr0 = B + (K_stride * x0);
- const bool use_result_buffer = (width < 16);
- float result_buffer[64];
- const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(float);
- float *c_ptr_real = c_ptr0;
- if (use_result_buffer && accumulate) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
- }
- }
- }
- if (use_result_buffer) {
- c_ptr0 = result_buffer;
- }
- const float *biasptr = bias ? bias+x0 : nullbias;
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "cbnz %[accumulate], 1f\n"
- "ldr q16, [%[biasptr]]\n"
- "ldr q17, [%[biasptr], #0x10]\n"
- "ldr q18, [%[biasptr], #0x20]\n"
- "ldr q19, [%[biasptr], #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "cbz %[regs], 4f\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "b 5f\n"
- "4:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "ld1r {v14.4s}, [%[minptr]]\n"
- "ld1r {v15.4s}, [%[maxptr]]\n"
- "fmax v16.4s, v16.4s, v14.4s\n"
- "fmax v17.4s, v17.4s, v14.4s\n"
- "fmax v18.4s, v18.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v14.4s\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
- "fmin v17.4s, v17.4s, v15.4s\n"
- "fmin v18.4s, v18.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v15.4s\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "ldr q16, [%[biasptr]]\n"
- "ldr q17, [%[biasptr], #0x10]\n"
- "ldr q18, [%[biasptr], #0x20]\n"
- "ldr q19, [%[biasptr], #0x30]\n"
- "mov v20.16b, v16.16b\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mov v21.16b, v17.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v22.16b, v18.16b\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mov v23.16b, v19.16b\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q5, [a_ptr1]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "fmla v20.4s, v12.4s, v5.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "fmla v21.4s, v13.4s, v5.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "fmla v22.4s, v14.4s, v5.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "fmla v23.4s, v15.4s, v5.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "fmla v20.4s, v12.4s, v5.s[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "fmla v21.4s, v13.4s, v5.s[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "fmla v22.4s, v14.4s, v5.s[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "fmla v23.4s, v15.4s, v5.s[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "cbz %[regs], 4f\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q5, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "fmla v20.4s, v12.4s, v5.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "fmla v21.4s, v13.4s, v5.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "fmla v22.4s, v14.4s, v5.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "fmla v23.4s, v15.4s, v5.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "fmla v20.4s, v12.4s, v5.s[3]\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "fmla v21.4s, v13.4s, v5.s[3]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "fmla v22.4s, v14.4s, v5.s[3]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "fmla v23.4s, v15.4s, v5.s[3]\n"
- "b 5f\n"
- "4:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr s1, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "ld1r {v14.4s}, [%[minptr]]\n"
- "ld1r {v15.4s}, [%[maxptr]]\n"
- "fmax v16.4s, v16.4s, v14.4s\n"
- "fmax v17.4s, v17.4s, v14.4s\n"
- "fmax v18.4s, v18.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v14.4s\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
- "fmin v17.4s, v17.4s, v15.4s\n"
- "fmin v18.4s, v18.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v15.4s\n"
- "str q16, [%[c_ptr0]]\n"
- "fmax v20.4s, v20.4s, v14.4s\n"
- "fmax v21.4s, v21.4s, v14.4s\n"
- "fmax v22.4s, v22.4s, v14.4s\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "fmax v23.4s, v23.4s, v14.4s\n"
- "fmin v20.4s, v20.4s, v15.4s\n"
- "fmin v21.4s, v21.4s, v15.4s\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "fmin v22.4s, v22.4s, v15.4s\n"
- "fmin v23.4s, v23.4s, v15.4s\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "ldr q16, [%[biasptr]]\n"
- "ldr q17, [%[biasptr], #0x10]\n"
- "ldr q18, [%[biasptr], #0x20]\n"
- "ldr q19, [%[biasptr], #0x30]\n"
- "mov v20.16b, v16.16b\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mov v21.16b, v17.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v22.16b, v18.16b\n"
- "ldr q2, [a_ptr2]\n"
- "mov v23.16b, v19.16b\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mov v24.16b, v16.16b\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "mov v25.16b, v17.16b\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "mov v26.16b, v18.16b\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "mov v27.16b, v19.16b\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr q5, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q6, [a_ptr2]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v24.4s, v12.4s, v2.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "fmla v25.4s, v13.4s, v2.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "fmla v26.4s, v14.4s, v2.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "fmla v27.4s, v15.4s, v2.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "fmla v24.4s, v12.4s, v2.s[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "fmla v25.4s, v13.4s, v2.s[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "fmla v26.4s, v14.4s, v2.s[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "fmla v27.4s, v15.4s, v2.s[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "fmla v24.4s, v8.4s, v6.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "fmla v25.4s, v9.4s, v6.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "fmla v26.4s, v10.4s, v6.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "fmla v27.4s, v11.4s, v6.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "fmla v20.4s, v12.4s, v5.s[1]\n"
- "fmla v24.4s, v12.4s, v6.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "fmla v21.4s, v13.4s, v5.s[1]\n"
- "fmla v25.4s, v13.4s, v6.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "fmla v22.4s, v14.4s, v5.s[1]\n"
- "fmla v26.4s, v14.4s, v6.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "fmla v23.4s, v15.4s, v5.s[1]\n"
- "fmla v27.4s, v15.4s, v6.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "fmla v24.4s, v8.4s, v6.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "fmla v25.4s, v9.4s, v6.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "fmla v26.4s, v10.4s, v6.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "fmla v27.4s, v11.4s, v6.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "fmla v20.4s, v12.4s, v5.s[3]\n"
- "fmla v24.4s, v12.4s, v6.s[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "fmla v21.4s, v13.4s, v5.s[3]\n"
- "fmla v25.4s, v13.4s, v6.s[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "fmla v22.4s, v14.4s, v5.s[3]\n"
- "fmla v26.4s, v14.4s, v6.s[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "fmla v23.4s, v15.4s, v5.s[3]\n"
- "fmla v27.4s, v15.4s, v6.s[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "cbz %[regs], 4f\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q5, [a_ptr1]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr q6, [a_ptr2]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "fmla v24.4s, v12.4s, v2.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "fmla v25.4s, v13.4s, v2.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "fmla v26.4s, v14.4s, v2.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "fmla v27.4s, v15.4s, v2.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "fmla v24.4s, v12.4s, v2.s[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "fmla v25.4s, v13.4s, v2.s[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "fmla v26.4s, v14.4s, v2.s[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "fmla v27.4s, v15.4s, v2.s[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "fmla v24.4s, v8.4s, v6.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "fmla v25.4s, v9.4s, v6.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "fmla v26.4s, v10.4s, v6.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "fmla v27.4s, v11.4s, v6.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "fmla v20.4s, v12.4s, v5.s[1]\n"
- "fmla v24.4s, v12.4s, v6.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "fmla v21.4s, v13.4s, v5.s[1]\n"
- "fmla v25.4s, v13.4s, v6.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "fmla v22.4s, v14.4s, v5.s[1]\n"
- "fmla v26.4s, v14.4s, v6.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "fmla v23.4s, v15.4s, v5.s[1]\n"
- "fmla v27.4s, v15.4s, v6.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "fmla v24.4s, v8.4s, v6.s[2]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "fmla v25.4s, v9.4s, v6.s[2]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "fmla v26.4s, v10.4s, v6.s[2]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "fmla v27.4s, v11.4s, v6.s[2]\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "fmla v20.4s, v12.4s, v5.s[3]\n"
- "fmla v24.4s, v12.4s, v6.s[3]\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "fmla v21.4s, v13.4s, v5.s[3]\n"
- "fmla v25.4s, v13.4s, v6.s[3]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "fmla v22.4s, v14.4s, v5.s[3]\n"
- "fmla v26.4s, v14.4s, v6.s[3]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "fmla v23.4s, v15.4s, v5.s[3]\n"
- "fmla v27.4s, v15.4s, v6.s[3]\n"
- "b 5f\n"
- "4:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "fmla v24.4s, v12.4s, v2.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "fmla v25.4s, v13.4s, v2.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "fmla v26.4s, v14.4s, v2.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "fmla v27.4s, v15.4s, v2.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "fmla v24.4s, v12.4s, v2.s[3]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "fmla v25.4s, v13.4s, v2.s[3]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "fmla v26.4s, v14.4s, v2.s[3]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "fmla v27.4s, v15.4s, v2.s[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr s1, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr s2, [a_ptr2]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "ld1r {v14.4s}, [%[minptr]]\n"
- "ld1r {v15.4s}, [%[maxptr]]\n"
- "fmax v16.4s, v16.4s, v14.4s\n"
- "fmax v17.4s, v17.4s, v14.4s\n"
- "fmax v18.4s, v18.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v14.4s\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
- "fmin v17.4s, v17.4s, v15.4s\n"
- "fmin v18.4s, v18.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v15.4s\n"
- "str q16, [%[c_ptr0]]\n"
- "fmax v20.4s, v20.4s, v14.4s\n"
- "fmax v21.4s, v21.4s, v14.4s\n"
- "fmax v22.4s, v22.4s, v14.4s\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "fmax v23.4s, v23.4s, v14.4s\n"
- "fmin v20.4s, v20.4s, v15.4s\n"
- "fmin v21.4s, v21.4s, v15.4s\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "fmin v22.4s, v22.4s, v15.4s\n"
- "fmin v23.4s, v23.4s, v15.4s\n"
- "fmax v24.4s, v24.4s, v14.4s\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "fmax v25.4s, v25.4s, v14.4s\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "fmax v26.4s, v26.4s, v14.4s\n"
- "str q20, [c_ptr1]\n"
- "fmin v24.4s, v24.4s, v15.4s\n"
- "fmin v25.4s, v25.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v14.4s\n"
- "str q21, [c_ptr1, #0x10]\n"
- "fmin v26.4s, v26.4s, v15.4s\n"
- "fmin v27.4s, v27.4s, v15.4s\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "ldr q16, [%[biasptr]]\n"
- "ldr q17, [%[biasptr], #0x10]\n"
- "ldr q18, [%[biasptr], #0x20]\n"
- "ldr q19, [%[biasptr], #0x30]\n"
- "mov v20.16b, v16.16b\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mov v21.16b, v17.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v22.16b, v18.16b\n"
- "ldr q2, [a_ptr2]\n"
- "mov v23.16b, v19.16b\n"
- "ldr q3, [a_ptr3]\n"
- "mov v24.16b, v16.16b\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mov v25.16b, v17.16b\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "mov v26.16b, v18.16b\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "mov v27.16b, v19.16b\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "mov v28.16b, v16.16b\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "mov v29.16b, v17.16b\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "mov v30.16b, v18.16b\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "mov v31.16b, v19.16b\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q28, [c_ptr3]\n"
- "ldr q29, [c_ptr3, #0x10]\n"
- "ldr q30, [c_ptr3, #0x20]\n"
- "ldr q31, [c_ptr3, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q3, [a_ptr3]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr q5, [a_ptr1]\n"
- "fmla v28.4s, v8.4s, v3.s[0]\n"
- "ldr q6, [a_ptr2]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q7, [a_ptr3]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v29.4s, v9.4s, v3.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v30.4s, v10.4s, v3.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v31.4s, v11.4s, v3.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- "fmla v24.4s, v12.4s, v2.s[1]\n"
- "fmla v28.4s, v12.4s, v3.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "fmla v25.4s, v13.4s, v2.s[1]\n"
- "fmla v29.4s, v13.4s, v3.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "fmla v26.4s, v14.4s, v2.s[1]\n"
- "fmla v30.4s, v14.4s, v3.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "fmla v27.4s, v15.4s, v2.s[1]\n"
- "fmla v31.4s, v15.4s, v3.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "fmla v28.4s, v8.4s, v3.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "fmla v29.4s, v9.4s, v3.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "fmla v30.4s, v10.4s, v3.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "fmla v31.4s, v11.4s, v3.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "fmla v24.4s, v12.4s, v2.s[3]\n"
- "fmla v28.4s, v12.4s, v3.s[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "fmla v25.4s, v13.4s, v2.s[3]\n"
- "fmla v29.4s, v13.4s, v3.s[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "fmla v26.4s, v14.4s, v2.s[3]\n"
- "fmla v30.4s, v14.4s, v3.s[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "fmla v27.4s, v15.4s, v2.s[3]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- "fmla v31.4s, v15.4s, v3.s[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr q3, [a_ptr3, #-0x10]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "fmla v24.4s, v8.4s, v6.s[0]\n"
- "fmla v28.4s, v8.4s, v7.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "fmla v25.4s, v9.4s, v6.s[0]\n"
- "fmla v29.4s, v9.4s, v7.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "fmla v26.4s, v10.4s, v6.s[0]\n"
- "fmla v30.4s, v10.4s, v7.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "fmla v27.4s, v11.4s, v6.s[0]\n"
- "fmla v31.4s, v11.4s, v7.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "fmla v20.4s, v12.4s, v5.s[1]\n"
- "fmla v24.4s, v12.4s, v6.s[1]\n"
- "fmla v28.4s, v12.4s, v7.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "fmla v21.4s, v13.4s, v5.s[1]\n"
- "fmla v25.4s, v13.4s, v6.s[1]\n"
- "fmla v29.4s, v13.4s, v7.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "fmla v22.4s, v14.4s, v5.s[1]\n"
- "fmla v26.4s, v14.4s, v6.s[1]\n"
- "fmla v30.4s, v14.4s, v7.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "fmla v23.4s, v15.4s, v5.s[1]\n"
- "fmla v27.4s, v15.4s, v6.s[1]\n"
- "fmla v31.4s, v15.4s, v7.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "fmla v24.4s, v8.4s, v6.s[2]\n"
- "fmla v28.4s, v8.4s, v7.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "fmla v25.4s, v9.4s, v6.s[2]\n"
- "fmla v29.4s, v9.4s, v7.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "fmla v26.4s, v10.4s, v6.s[2]\n"
- "fmla v30.4s, v10.4s, v7.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "fmla v27.4s, v11.4s, v6.s[2]\n"
- "fmla v31.4s, v11.4s, v7.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "fmla v20.4s, v12.4s, v5.s[3]\n"
- "fmla v24.4s, v12.4s, v6.s[3]\n"
- "fmla v28.4s, v12.4s, v7.s[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "fmla v21.4s, v13.4s, v5.s[3]\n"
- "fmla v25.4s, v13.4s, v6.s[3]\n"
- "fmla v29.4s, v13.4s, v7.s[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "fmla v22.4s, v14.4s, v5.s[3]\n"
- "fmla v26.4s, v14.4s, v6.s[3]\n"
- "fmla v30.4s, v14.4s, v7.s[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "fmla v23.4s, v15.4s, v5.s[3]\n"
- "fmla v27.4s, v15.4s, v6.s[3]\n"
- "fmla v31.4s, v15.4s, v7.s[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "cbz %[regs], 4f\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q5, [a_ptr1]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr q6, [a_ptr2]\n"
- "fmla v28.4s, v8.4s, v3.s[0]\n"
- "ldr q7, [a_ptr3]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v29.4s, v9.4s, v3.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "fmla v30.4s, v10.4s, v3.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "fmla v31.4s, v11.4s, v3.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "fmla v24.4s, v12.4s, v2.s[1]\n"
- "fmla v28.4s, v12.4s, v3.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "fmla v25.4s, v13.4s, v2.s[1]\n"
- "fmla v29.4s, v13.4s, v3.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "fmla v26.4s, v14.4s, v2.s[1]\n"
- "fmla v30.4s, v14.4s, v3.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "fmla v27.4s, v15.4s, v2.s[1]\n"
- "fmla v31.4s, v15.4s, v3.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "fmla v28.4s, v8.4s, v3.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "fmla v29.4s, v9.4s, v3.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "fmla v30.4s, v10.4s, v3.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "fmla v31.4s, v11.4s, v3.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "fmla v24.4s, v12.4s, v2.s[3]\n"
- "fmla v28.4s, v12.4s, v3.s[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "fmla v25.4s, v13.4s, v2.s[3]\n"
- "fmla v29.4s, v13.4s, v3.s[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "fmla v26.4s, v14.4s, v2.s[3]\n"
- "fmla v30.4s, v14.4s, v3.s[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "fmla v27.4s, v15.4s, v2.s[3]\n"
- "fmla v31.4s, v15.4s, v3.s[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "fmla v24.4s, v8.4s, v6.s[0]\n"
- "fmla v28.4s, v8.4s, v7.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "fmla v25.4s, v9.4s, v6.s[0]\n"
- "fmla v29.4s, v9.4s, v7.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "fmla v26.4s, v10.4s, v6.s[0]\n"
- "fmla v30.4s, v10.4s, v7.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "fmla v27.4s, v11.4s, v6.s[0]\n"
- "fmla v31.4s, v11.4s, v7.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v4.s[1]\n"
- "fmla v20.4s, v12.4s, v5.s[1]\n"
- "fmla v24.4s, v12.4s, v6.s[1]\n"
- "fmla v28.4s, v12.4s, v7.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v4.s[1]\n"
- "fmla v21.4s, v13.4s, v5.s[1]\n"
- "fmla v25.4s, v13.4s, v6.s[1]\n"
- "fmla v29.4s, v13.4s, v7.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v4.s[1]\n"
- "fmla v22.4s, v14.4s, v5.s[1]\n"
- "fmla v26.4s, v14.4s, v6.s[1]\n"
- "fmla v30.4s, v14.4s, v7.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v4.s[1]\n"
- "fmla v23.4s, v15.4s, v5.s[1]\n"
- "fmla v27.4s, v15.4s, v6.s[1]\n"
- "fmla v31.4s, v15.4s, v7.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "fmla v24.4s, v8.4s, v6.s[2]\n"
- "fmla v28.4s, v8.4s, v7.s[2]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "fmla v25.4s, v9.4s, v6.s[2]\n"
- "fmla v29.4s, v9.4s, v7.s[2]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "fmla v26.4s, v10.4s, v6.s[2]\n"
- "fmla v30.4s, v10.4s, v7.s[2]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "fmla v27.4s, v11.4s, v6.s[2]\n"
- "fmla v31.4s, v11.4s, v7.s[2]\n"
- "fmla v16.4s, v12.4s, v4.s[3]\n"
- "fmla v20.4s, v12.4s, v5.s[3]\n"
- "fmla v24.4s, v12.4s, v6.s[3]\n"
- "fmla v28.4s, v12.4s, v7.s[3]\n"
- "fmla v17.4s, v13.4s, v4.s[3]\n"
- "fmla v21.4s, v13.4s, v5.s[3]\n"
- "fmla v25.4s, v13.4s, v6.s[3]\n"
- "fmla v29.4s, v13.4s, v7.s[3]\n"
- "fmla v18.4s, v14.4s, v4.s[3]\n"
- "fmla v22.4s, v14.4s, v5.s[3]\n"
- "fmla v26.4s, v14.4s, v6.s[3]\n"
- "fmla v30.4s, v14.4s, v7.s[3]\n"
- "fmla v19.4s, v15.4s, v4.s[3]\n"
- "fmla v23.4s, v15.4s, v5.s[3]\n"
- "fmla v27.4s, v15.4s, v6.s[3]\n"
- "fmla v31.4s, v15.4s, v7.s[3]\n"
- "b 5f\n"
- "4:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "fmla v28.4s, v8.4s, v3.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "fmla v29.4s, v9.4s, v3.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "fmla v30.4s, v10.4s, v3.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "fmla v31.4s, v11.4s, v3.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v12.4s, v0.s[1]\n"
- "fmla v20.4s, v12.4s, v1.s[1]\n"
- "fmla v24.4s, v12.4s, v2.s[1]\n"
- "fmla v28.4s, v12.4s, v3.s[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v13.4s, v0.s[1]\n"
- "fmla v21.4s, v13.4s, v1.s[1]\n"
- "fmla v25.4s, v13.4s, v2.s[1]\n"
- "fmla v29.4s, v13.4s, v3.s[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v14.4s, v0.s[1]\n"
- "fmla v22.4s, v14.4s, v1.s[1]\n"
- "fmla v26.4s, v14.4s, v2.s[1]\n"
- "fmla v30.4s, v14.4s, v3.s[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v15.4s, v0.s[1]\n"
- "fmla v23.4s, v15.4s, v1.s[1]\n"
- "fmla v27.4s, v15.4s, v2.s[1]\n"
- "fmla v31.4s, v15.4s, v3.s[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "fmla v28.4s, v8.4s, v3.s[2]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "fmla v29.4s, v9.4s, v3.s[2]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "fmla v30.4s, v10.4s, v3.s[2]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "fmla v31.4s, v11.4s, v3.s[2]\n"
- "fmla v16.4s, v12.4s, v0.s[3]\n"
- "fmla v20.4s, v12.4s, v1.s[3]\n"
- "fmla v24.4s, v12.4s, v2.s[3]\n"
- "fmla v28.4s, v12.4s, v3.s[3]\n"
- "fmla v17.4s, v13.4s, v0.s[3]\n"
- "fmla v21.4s, v13.4s, v1.s[3]\n"
- "fmla v25.4s, v13.4s, v2.s[3]\n"
- "fmla v29.4s, v13.4s, v3.s[3]\n"
- "fmla v18.4s, v14.4s, v0.s[3]\n"
- "fmla v22.4s, v14.4s, v1.s[3]\n"
- "fmla v26.4s, v14.4s, v2.s[3]\n"
- "fmla v30.4s, v14.4s, v3.s[3]\n"
- "fmla v19.4s, v15.4s, v0.s[3]\n"
- "fmla v23.4s, v15.4s, v1.s[3]\n"
- "fmla v27.4s, v15.4s, v2.s[3]\n"
- "fmla v31.4s, v15.4s, v3.s[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr s1, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr s2, [a_ptr2]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr s3, [a_ptr3]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "add a_ptr3, a_ptr3, #0x4\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "fmla v28.4s, v8.4s, v3.s[0]\n"
- "fmla v29.4s, v9.4s, v3.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "fmla v30.4s, v10.4s, v3.s[0]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "fmla v31.4s, v11.4s, v3.s[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "ld1r {v14.4s}, [%[minptr]]\n"
- "ld1r {v15.4s}, [%[maxptr]]\n"
- "fmax v16.4s, v16.4s, v14.4s\n"
- "fmax v17.4s, v17.4s, v14.4s\n"
- "fmax v18.4s, v18.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v14.4s\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
- "fmin v17.4s, v17.4s, v15.4s\n"
- "fmin v18.4s, v18.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v15.4s\n"
- "str q16, [%[c_ptr0]]\n"
- "fmax v20.4s, v20.4s, v14.4s\n"
- "fmax v21.4s, v21.4s, v14.4s\n"
- "fmax v22.4s, v22.4s, v14.4s\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "fmax v23.4s, v23.4s, v14.4s\n"
- "fmin v20.4s, v20.4s, v15.4s\n"
- "fmin v21.4s, v21.4s, v15.4s\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "fmin v22.4s, v22.4s, v15.4s\n"
- "fmin v23.4s, v23.4s, v15.4s\n"
- "fmax v24.4s, v24.4s, v14.4s\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "fmax v25.4s, v25.4s, v14.4s\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "fmax v26.4s, v26.4s, v14.4s\n"
- "str q20, [c_ptr1]\n"
- "fmin v24.4s, v24.4s, v15.4s\n"
- "fmin v25.4s, v25.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v14.4s\n"
- "str q21, [c_ptr1, #0x10]\n"
- "fmin v26.4s, v26.4s, v15.4s\n"
- "fmax v28.4s, v28.4s, v14.4s\n"
- "fmax v29.4s, v29.4s, v14.4s\n"
- "str q22, [c_ptr1, #0x20]\n"
- "fmin v27.4s, v27.4s, v15.4s\n"
- "fmax v30.4s, v30.4s, v14.4s\n"
- "fmin v28.4s, v28.4s, v15.4s\n"
- "str q23, [c_ptr1, #0x30]\n"
- "fmin v29.4s, v29.4s, v15.4s\n"
- "fmax v31.4s, v31.4s, v14.4s\n"
- "fmin v30.4s, v30.4s, v15.4s\n"
- "str q24, [c_ptr2]\n"
- "fmin v31.4s, v31.4s, v15.4s\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- "str q28, [c_ptr3]\n"
- "str q29, [c_ptr3, #0x10]\n"
- "str q30, [c_ptr3, #0x20]\n"
- "str q31, [c_ptr3, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- }
- if (use_result_buffer) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
- }
- }
- }
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp
deleted file mode 100644
index 3f1df76a6a..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4/x1.cpp
+++ /dev/null
@@ -1,1810 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_fp32_mla_16x4_x1(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
- const int K_stride = K;
- const long loops_count = ((K + 4) / 8) - 1;
- K -= loops_count * 8;
- const long regs_count = (K / 4) - 1;
- K -= (regs_count + 1) * 4;
- const long blocks_count = K / 1;
- float nullbias[16];
- if (!accumulate && !bias) {
- memset(nullbias, 0, (16 * sizeof(float)));
- }
- float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
- float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
- const float * const minptr = &minval;
- const float * const maxptr = &maxval;
-
- switch(act.type)
- {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- maxval = static_cast<float>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- minval = 0.0f;
- break;
- }
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const float * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(float);
-
- float *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=16ul) {
- const long width = std::min((unsigned long)N-x0, 16ul);
- long loops = loops_count;
- long regs = regs_count;
- long blocks = blocks_count;
- const float *a_ptr0 = a_ptr0_base;
- const float *b_ptr0 = B + (K_stride * x0);
- const bool use_result_buffer = (width < 16);
- float result_buffer[64];
- const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(float);
- float *c_ptr_real = c_ptr0;
- if (use_result_buffer && accumulate) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
- }
- }
- }
- if (use_result_buffer) {
- c_ptr0 = result_buffer;
- }
- const float *biasptr = bias ? bias+x0 : nullbias;
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "cbnz %[accumulate], 1f\n"
- "ldr q16, [%[biasptr]]\n"
- "ldr q17, [%[biasptr], #0x10]\n"
- "ldr q18, [%[biasptr], #0x20]\n"
- "ldr q19, [%[biasptr], #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v17.4s, v9.4s, v0.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v18.4s, v10.4s, v0.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v16.4s, v8.4s, v0.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v0.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[3]\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v4.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v4.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v4.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v4.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "cbz %[regs], 4f\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v0.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v0.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v0.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[3]\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v4.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v4.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "add %[b_ptr0], %[b_ptr0], #-0x40\n"
- "fmla v16.4s, v8.4s, v4.s[3]\n"
- "fmla v17.4s, v9.4s, v4.s[3]\n"
- "fmla v18.4s, v10.4s, v4.s[3]\n"
- "fmla v19.4s, v11.4s, v4.s[3]\n"
- "b 5f\n"
- "4:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v0.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "add %[b_ptr0], %[b_ptr0], #-0x40\n"
- "fmla v16.4s, v8.4s, v0.s[3]\n"
- "fmla v17.4s, v9.4s, v0.s[3]\n"
- "fmla v18.4s, v10.4s, v0.s[3]\n"
- "fmla v19.4s, v11.4s, v0.s[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "ld1r {v14.4s}, [%[minptr]]\n"
- "ld1r {v15.4s}, [%[maxptr]]\n"
- "fmax v16.4s, v16.4s, v14.4s\n"
- "fmax v17.4s, v17.4s, v14.4s\n"
- "fmax v18.4s, v18.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v14.4s\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
- "fmin v17.4s, v17.4s, v15.4s\n"
- "fmin v18.4s, v18.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v15.4s\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "ldr q16, [%[biasptr]]\n"
- "ldr q17, [%[biasptr], #0x10]\n"
- "ldr q18, [%[biasptr], #0x20]\n"
- "ldr q19, [%[biasptr], #0x30]\n"
- "mov v20.16b, v16.16b\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mov v21.16b, v17.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v22.16b, v18.16b\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mov v23.16b, v19.16b\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q5, [a_ptr1]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v20.4s, v8.4s, v1.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v21.4s, v9.4s, v1.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v0.s[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v22.4s, v10.4s, v1.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v23.4s, v11.4s, v1.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v0.s[3]\n"
- "fmla v20.4s, v8.4s, v1.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[3]\n"
- "fmla v21.4s, v9.4s, v1.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v0.s[3]\n"
- "fmla v22.4s, v10.4s, v1.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v23.4s, v11.4s, v1.s[3]\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v4.s[1]\n"
- "fmla v20.4s, v8.4s, v5.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[1]\n"
- "fmla v21.4s, v9.4s, v5.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v4.s[1]\n"
- "fmla v22.4s, v10.4s, v5.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[1]\n"
- "fmla v23.4s, v11.4s, v5.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v4.s[3]\n"
- "fmla v20.4s, v8.4s, v5.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[3]\n"
- "fmla v21.4s, v9.4s, v5.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v4.s[3]\n"
- "fmla v22.4s, v10.4s, v5.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[3]\n"
- "fmla v23.4s, v11.4s, v5.s[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "cbz %[regs], 4f\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q5, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "fmla v20.4s, v8.4s, v1.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[1]\n"
- "fmla v21.4s, v9.4s, v1.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v0.s[1]\n"
- "fmla v22.4s, v10.4s, v1.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[1]\n"
- "fmla v23.4s, v11.4s, v1.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v0.s[3]\n"
- "fmla v20.4s, v8.4s, v1.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[3]\n"
- "fmla v21.4s, v9.4s, v1.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v0.s[3]\n"
- "fmla v22.4s, v10.4s, v1.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[3]\n"
- "fmla v23.4s, v11.4s, v1.s[3]\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v4.s[1]\n"
- "fmla v20.4s, v8.4s, v5.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[1]\n"
- "fmla v21.4s, v9.4s, v5.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v4.s[1]\n"
- "fmla v22.4s, v10.4s, v5.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[1]\n"
- "fmla v23.4s, v11.4s, v5.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v4.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #-0x40\n"
- "fmla v20.4s, v8.4s, v5.s[3]\n"
- "fmla v17.4s, v9.4s, v4.s[3]\n"
- "fmla v21.4s, v9.4s, v5.s[3]\n"
- "fmla v18.4s, v10.4s, v4.s[3]\n"
- "fmla v22.4s, v10.4s, v5.s[3]\n"
- "fmla v19.4s, v11.4s, v4.s[3]\n"
- "fmla v23.4s, v11.4s, v5.s[3]\n"
- "b 5f\n"
- "4:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "fmla v20.4s, v8.4s, v1.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[1]\n"
- "fmla v21.4s, v9.4s, v1.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v0.s[1]\n"
- "fmla v22.4s, v10.4s, v1.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[1]\n"
- "fmla v23.4s, v11.4s, v1.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v0.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #-0x40\n"
- "fmla v20.4s, v8.4s, v1.s[3]\n"
- "fmla v17.4s, v9.4s, v0.s[3]\n"
- "fmla v21.4s, v9.4s, v1.s[3]\n"
- "fmla v18.4s, v10.4s, v0.s[3]\n"
- "fmla v22.4s, v10.4s, v1.s[3]\n"
- "fmla v19.4s, v11.4s, v0.s[3]\n"
- "fmla v23.4s, v11.4s, v1.s[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr s1, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "ld1r {v14.4s}, [%[minptr]]\n"
- "ld1r {v15.4s}, [%[maxptr]]\n"
- "fmax v16.4s, v16.4s, v14.4s\n"
- "fmax v17.4s, v17.4s, v14.4s\n"
- "fmax v18.4s, v18.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v14.4s\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
- "fmin v17.4s, v17.4s, v15.4s\n"
- "fmin v18.4s, v18.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v15.4s\n"
- "str q16, [%[c_ptr0]]\n"
- "fmax v20.4s, v20.4s, v14.4s\n"
- "fmax v21.4s, v21.4s, v14.4s\n"
- "fmax v22.4s, v22.4s, v14.4s\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "fmax v23.4s, v23.4s, v14.4s\n"
- "fmin v20.4s, v20.4s, v15.4s\n"
- "fmin v21.4s, v21.4s, v15.4s\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "fmin v22.4s, v22.4s, v15.4s\n"
- "fmin v23.4s, v23.4s, v15.4s\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "ldr q16, [%[biasptr]]\n"
- "ldr q17, [%[biasptr], #0x10]\n"
- "ldr q18, [%[biasptr], #0x20]\n"
- "ldr q19, [%[biasptr], #0x30]\n"
- "mov v20.16b, v16.16b\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mov v21.16b, v17.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v22.16b, v18.16b\n"
- "ldr q2, [a_ptr2]\n"
- "mov v23.16b, v19.16b\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mov v24.16b, v16.16b\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "mov v25.16b, v17.16b\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "mov v26.16b, v18.16b\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "mov v27.16b, v19.16b\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr q5, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q6, [a_ptr2]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla v20.4s, v8.4s, v1.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v24.4s, v8.4s, v2.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- "fmla v21.4s, v9.4s, v1.s[1]\n"
- "fmla v25.4s, v9.4s, v2.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v0.s[1]\n"
- "fmla v22.4s, v10.4s, v1.s[1]\n"
- "fmla v26.4s, v10.4s, v2.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[1]\n"
- "fmla v23.4s, v11.4s, v1.s[1]\n"
- "fmla v27.4s, v11.4s, v2.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v0.s[3]\n"
- "fmla v20.4s, v8.4s, v1.s[3]\n"
- "fmla v24.4s, v8.4s, v2.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[3]\n"
- "fmla v21.4s, v9.4s, v1.s[3]\n"
- "fmla v25.4s, v9.4s, v2.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v0.s[3]\n"
- "fmla v22.4s, v10.4s, v1.s[3]\n"
- "fmla v26.4s, v10.4s, v2.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v23.4s, v11.4s, v1.s[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "fmla v27.4s, v11.4s, v2.s[3]\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "fmla v24.4s, v8.4s, v6.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "fmla v25.4s, v9.4s, v6.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "fmla v26.4s, v10.4s, v6.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "fmla v27.4s, v11.4s, v6.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v4.s[1]\n"
- "fmla v20.4s, v8.4s, v5.s[1]\n"
- "fmla v24.4s, v8.4s, v6.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[1]\n"
- "fmla v21.4s, v9.4s, v5.s[1]\n"
- "fmla v25.4s, v9.4s, v6.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v4.s[1]\n"
- "fmla v22.4s, v10.4s, v5.s[1]\n"
- "fmla v26.4s, v10.4s, v6.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[1]\n"
- "fmla v23.4s, v11.4s, v5.s[1]\n"
- "fmla v27.4s, v11.4s, v6.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "fmla v24.4s, v8.4s, v6.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "fmla v25.4s, v9.4s, v6.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "fmla v26.4s, v10.4s, v6.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "fmla v27.4s, v11.4s, v6.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v4.s[3]\n"
- "fmla v20.4s, v8.4s, v5.s[3]\n"
- "fmla v24.4s, v8.4s, v6.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[3]\n"
- "fmla v21.4s, v9.4s, v5.s[3]\n"
- "fmla v25.4s, v9.4s, v6.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v4.s[3]\n"
- "fmla v22.4s, v10.4s, v5.s[3]\n"
- "fmla v26.4s, v10.4s, v6.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[3]\n"
- "fmla v23.4s, v11.4s, v5.s[3]\n"
- "fmla v27.4s, v11.4s, v6.s[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "cbz %[regs], 4f\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q5, [a_ptr1]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr q6, [a_ptr2]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "fmla v20.4s, v8.4s, v1.s[1]\n"
- "fmla v24.4s, v8.4s, v2.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[1]\n"
- "fmla v21.4s, v9.4s, v1.s[1]\n"
- "fmla v25.4s, v9.4s, v2.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v0.s[1]\n"
- "fmla v22.4s, v10.4s, v1.s[1]\n"
- "fmla v26.4s, v10.4s, v2.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[1]\n"
- "fmla v23.4s, v11.4s, v1.s[1]\n"
- "fmla v27.4s, v11.4s, v2.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v0.s[3]\n"
- "fmla v20.4s, v8.4s, v1.s[3]\n"
- "fmla v24.4s, v8.4s, v2.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[3]\n"
- "fmla v21.4s, v9.4s, v1.s[3]\n"
- "fmla v25.4s, v9.4s, v2.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v0.s[3]\n"
- "fmla v22.4s, v10.4s, v1.s[3]\n"
- "fmla v26.4s, v10.4s, v2.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[3]\n"
- "fmla v23.4s, v11.4s, v1.s[3]\n"
- "fmla v27.4s, v11.4s, v2.s[3]\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "fmla v24.4s, v8.4s, v6.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "fmla v25.4s, v9.4s, v6.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "fmla v26.4s, v10.4s, v6.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "fmla v27.4s, v11.4s, v6.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v4.s[1]\n"
- "fmla v20.4s, v8.4s, v5.s[1]\n"
- "fmla v24.4s, v8.4s, v6.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[1]\n"
- "fmla v21.4s, v9.4s, v5.s[1]\n"
- "fmla v25.4s, v9.4s, v6.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v4.s[1]\n"
- "fmla v22.4s, v10.4s, v5.s[1]\n"
- "fmla v26.4s, v10.4s, v6.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[1]\n"
- "fmla v23.4s, v11.4s, v5.s[1]\n"
- "fmla v27.4s, v11.4s, v6.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "fmla v24.4s, v8.4s, v6.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "fmla v25.4s, v9.4s, v6.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "fmla v26.4s, v10.4s, v6.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "fmla v27.4s, v11.4s, v6.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v4.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #-0x40\n"
- "fmla v20.4s, v8.4s, v5.s[3]\n"
- "fmla v24.4s, v8.4s, v6.s[3]\n"
- "fmla v17.4s, v9.4s, v4.s[3]\n"
- "fmla v21.4s, v9.4s, v5.s[3]\n"
- "fmla v25.4s, v9.4s, v6.s[3]\n"
- "fmla v18.4s, v10.4s, v4.s[3]\n"
- "fmla v22.4s, v10.4s, v5.s[3]\n"
- "fmla v26.4s, v10.4s, v6.s[3]\n"
- "fmla v19.4s, v11.4s, v4.s[3]\n"
- "fmla v23.4s, v11.4s, v5.s[3]\n"
- "fmla v27.4s, v11.4s, v6.s[3]\n"
- "b 5f\n"
- "4:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "fmla v20.4s, v8.4s, v1.s[1]\n"
- "fmla v24.4s, v8.4s, v2.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[1]\n"
- "fmla v21.4s, v9.4s, v1.s[1]\n"
- "fmla v25.4s, v9.4s, v2.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v0.s[1]\n"
- "fmla v22.4s, v10.4s, v1.s[1]\n"
- "fmla v26.4s, v10.4s, v2.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[1]\n"
- "fmla v23.4s, v11.4s, v1.s[1]\n"
- "fmla v27.4s, v11.4s, v2.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v0.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #-0x40\n"
- "fmla v20.4s, v8.4s, v1.s[3]\n"
- "fmla v24.4s, v8.4s, v2.s[3]\n"
- "fmla v17.4s, v9.4s, v0.s[3]\n"
- "fmla v21.4s, v9.4s, v1.s[3]\n"
- "fmla v25.4s, v9.4s, v2.s[3]\n"
- "fmla v18.4s, v10.4s, v0.s[3]\n"
- "fmla v22.4s, v10.4s, v1.s[3]\n"
- "fmla v26.4s, v10.4s, v2.s[3]\n"
- "fmla v19.4s, v11.4s, v0.s[3]\n"
- "fmla v23.4s, v11.4s, v1.s[3]\n"
- "fmla v27.4s, v11.4s, v2.s[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr s1, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr s2, [a_ptr2]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "ld1r {v14.4s}, [%[minptr]]\n"
- "ld1r {v15.4s}, [%[maxptr]]\n"
- "fmax v16.4s, v16.4s, v14.4s\n"
- "fmax v17.4s, v17.4s, v14.4s\n"
- "fmax v18.4s, v18.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v14.4s\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
- "fmin v17.4s, v17.4s, v15.4s\n"
- "fmin v18.4s, v18.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v15.4s\n"
- "str q16, [%[c_ptr0]]\n"
- "fmax v20.4s, v20.4s, v14.4s\n"
- "fmax v21.4s, v21.4s, v14.4s\n"
- "fmax v22.4s, v22.4s, v14.4s\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "fmax v23.4s, v23.4s, v14.4s\n"
- "fmin v20.4s, v20.4s, v15.4s\n"
- "fmin v21.4s, v21.4s, v15.4s\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "fmin v22.4s, v22.4s, v15.4s\n"
- "fmin v23.4s, v23.4s, v15.4s\n"
- "fmax v24.4s, v24.4s, v14.4s\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "fmax v25.4s, v25.4s, v14.4s\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "fmax v26.4s, v26.4s, v14.4s\n"
- "str q20, [c_ptr1]\n"
- "fmin v24.4s, v24.4s, v15.4s\n"
- "fmin v25.4s, v25.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v14.4s\n"
- "str q21, [c_ptr1, #0x10]\n"
- "fmin v26.4s, v26.4s, v15.4s\n"
- "fmin v27.4s, v27.4s, v15.4s\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "ldr q16, [%[biasptr]]\n"
- "ldr q17, [%[biasptr], #0x10]\n"
- "ldr q18, [%[biasptr], #0x20]\n"
- "ldr q19, [%[biasptr], #0x30]\n"
- "mov v20.16b, v16.16b\n"
- "ldr q0, [%[a_ptr0]]\n"
- "mov v21.16b, v17.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v22.16b, v18.16b\n"
- "ldr q2, [a_ptr2]\n"
- "mov v23.16b, v19.16b\n"
- "ldr q3, [a_ptr3]\n"
- "mov v24.16b, v16.16b\n"
- "ldr q8, [%[b_ptr0]]\n"
- "mov v25.16b, v17.16b\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "mov v26.16b, v18.16b\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "mov v27.16b, v19.16b\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "mov v28.16b, v16.16b\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "mov v29.16b, v17.16b\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "mov v30.16b, v18.16b\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov v31.16b, v19.16b\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q28, [c_ptr3]\n"
- "ldr q29, [c_ptr3, #0x10]\n"
- "ldr q30, [c_ptr3, #0x20]\n"
- "ldr q31, [c_ptr3, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q3, [a_ptr3]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr q5, [a_ptr1]\n"
- "fmla v28.4s, v8.4s, v3.s[0]\n"
- "ldr q6, [a_ptr2]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q7, [a_ptr3]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v29.4s, v9.4s, v3.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v30.4s, v10.4s, v3.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v31.4s, v11.4s, v3.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- "fmla v20.4s, v8.4s, v1.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- "fmla v24.4s, v8.4s, v2.s[1]\n"
- "fmla v28.4s, v8.4s, v3.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[1]\n"
- "fmla v21.4s, v9.4s, v1.s[1]\n"
- "fmla v25.4s, v9.4s, v2.s[1]\n"
- "fmla v29.4s, v9.4s, v3.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v0.s[1]\n"
- "fmla v22.4s, v10.4s, v1.s[1]\n"
- "fmla v26.4s, v10.4s, v2.s[1]\n"
- "fmla v30.4s, v10.4s, v3.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[1]\n"
- "fmla v23.4s, v11.4s, v1.s[1]\n"
- "fmla v27.4s, v11.4s, v2.s[1]\n"
- "fmla v31.4s, v11.4s, v3.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "fmla v28.4s, v8.4s, v3.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "fmla v29.4s, v9.4s, v3.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "fmla v30.4s, v10.4s, v3.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "fmla v31.4s, v11.4s, v3.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v0.s[3]\n"
- "fmla v20.4s, v8.4s, v1.s[3]\n"
- "fmla v24.4s, v8.4s, v2.s[3]\n"
- "fmla v28.4s, v8.4s, v3.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[3]\n"
- "fmla v21.4s, v9.4s, v1.s[3]\n"
- "fmla v25.4s, v9.4s, v2.s[3]\n"
- "fmla v29.4s, v9.4s, v3.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v0.s[3]\n"
- "fmla v22.4s, v10.4s, v1.s[3]\n"
- "fmla v26.4s, v10.4s, v2.s[3]\n"
- "fmla v30.4s, v10.4s, v3.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v23.4s, v11.4s, v1.s[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "fmla v27.4s, v11.4s, v2.s[3]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- "fmla v31.4s, v11.4s, v3.s[3]\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "ldr q3, [a_ptr3, #-0x10]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "fmla v24.4s, v8.4s, v6.s[0]\n"
- "fmla v28.4s, v8.4s, v7.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "fmla v25.4s, v9.4s, v6.s[0]\n"
- "fmla v29.4s, v9.4s, v7.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "fmla v26.4s, v10.4s, v6.s[0]\n"
- "fmla v30.4s, v10.4s, v7.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "fmla v27.4s, v11.4s, v6.s[0]\n"
- "fmla v31.4s, v11.4s, v7.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v4.s[1]\n"
- "fmla v20.4s, v8.4s, v5.s[1]\n"
- "fmla v24.4s, v8.4s, v6.s[1]\n"
- "fmla v28.4s, v8.4s, v7.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[1]\n"
- "fmla v21.4s, v9.4s, v5.s[1]\n"
- "fmla v25.4s, v9.4s, v6.s[1]\n"
- "fmla v29.4s, v9.4s, v7.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v4.s[1]\n"
- "fmla v22.4s, v10.4s, v5.s[1]\n"
- "fmla v26.4s, v10.4s, v6.s[1]\n"
- "fmla v30.4s, v10.4s, v7.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[1]\n"
- "fmla v23.4s, v11.4s, v5.s[1]\n"
- "fmla v27.4s, v11.4s, v6.s[1]\n"
- "fmla v31.4s, v11.4s, v7.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "fmla v24.4s, v8.4s, v6.s[2]\n"
- "fmla v28.4s, v8.4s, v7.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "fmla v25.4s, v9.4s, v6.s[2]\n"
- "fmla v29.4s, v9.4s, v7.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "fmla v26.4s, v10.4s, v6.s[2]\n"
- "fmla v30.4s, v10.4s, v7.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "fmla v27.4s, v11.4s, v6.s[2]\n"
- "fmla v31.4s, v11.4s, v7.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v4.s[3]\n"
- "fmla v20.4s, v8.4s, v5.s[3]\n"
- "fmla v24.4s, v8.4s, v6.s[3]\n"
- "fmla v28.4s, v8.4s, v7.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[3]\n"
- "fmla v21.4s, v9.4s, v5.s[3]\n"
- "fmla v25.4s, v9.4s, v6.s[3]\n"
- "fmla v29.4s, v9.4s, v7.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v4.s[3]\n"
- "fmla v22.4s, v10.4s, v5.s[3]\n"
- "fmla v26.4s, v10.4s, v6.s[3]\n"
- "fmla v30.4s, v10.4s, v7.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[3]\n"
- "fmla v23.4s, v11.4s, v5.s[3]\n"
- "fmla v27.4s, v11.4s, v6.s[3]\n"
- "fmla v31.4s, v11.4s, v7.s[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "cbz %[regs], 4f\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "ldr q5, [a_ptr1]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "ldr q6, [a_ptr2]\n"
- "fmla v28.4s, v8.4s, v3.s[0]\n"
- "ldr q7, [a_ptr3]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v29.4s, v9.4s, v3.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "fmla v30.4s, v10.4s, v3.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "fmla v31.4s, v11.4s, v3.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "fmla v20.4s, v8.4s, v1.s[1]\n"
- "fmla v24.4s, v8.4s, v2.s[1]\n"
- "fmla v28.4s, v8.4s, v3.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[1]\n"
- "fmla v21.4s, v9.4s, v1.s[1]\n"
- "fmla v25.4s, v9.4s, v2.s[1]\n"
- "fmla v29.4s, v9.4s, v3.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v0.s[1]\n"
- "fmla v22.4s, v10.4s, v1.s[1]\n"
- "fmla v26.4s, v10.4s, v2.s[1]\n"
- "fmla v30.4s, v10.4s, v3.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[1]\n"
- "fmla v23.4s, v11.4s, v1.s[1]\n"
- "fmla v27.4s, v11.4s, v2.s[1]\n"
- "fmla v31.4s, v11.4s, v3.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "fmla v28.4s, v8.4s, v3.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "fmla v29.4s, v9.4s, v3.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "fmla v30.4s, v10.4s, v3.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "fmla v31.4s, v11.4s, v3.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v0.s[3]\n"
- "fmla v20.4s, v8.4s, v1.s[3]\n"
- "fmla v24.4s, v8.4s, v2.s[3]\n"
- "fmla v28.4s, v8.4s, v3.s[3]\n"
- "ldr q8, [%[b_ptr0], #-0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[3]\n"
- "fmla v21.4s, v9.4s, v1.s[3]\n"
- "fmla v25.4s, v9.4s, v2.s[3]\n"
- "fmla v29.4s, v9.4s, v3.s[3]\n"
- "ldr q9, [%[b_ptr0], #-0x30]\n"
- "fmla v18.4s, v10.4s, v0.s[3]\n"
- "fmla v22.4s, v10.4s, v1.s[3]\n"
- "fmla v26.4s, v10.4s, v2.s[3]\n"
- "fmla v30.4s, v10.4s, v3.s[3]\n"
- "ldr q10, [%[b_ptr0], #-0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[3]\n"
- "fmla v23.4s, v11.4s, v1.s[3]\n"
- "fmla v27.4s, v11.4s, v2.s[3]\n"
- "fmla v31.4s, v11.4s, v3.s[3]\n"
- "ldr q11, [%[b_ptr0], #-0x10]\n"
- "fmla v16.4s, v8.4s, v4.s[0]\n"
- "fmla v20.4s, v8.4s, v5.s[0]\n"
- "fmla v24.4s, v8.4s, v6.s[0]\n"
- "fmla v28.4s, v8.4s, v7.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v4.s[0]\n"
- "fmla v21.4s, v9.4s, v5.s[0]\n"
- "fmla v25.4s, v9.4s, v6.s[0]\n"
- "fmla v29.4s, v9.4s, v7.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v4.s[0]\n"
- "fmla v22.4s, v10.4s, v5.s[0]\n"
- "fmla v26.4s, v10.4s, v6.s[0]\n"
- "fmla v30.4s, v10.4s, v7.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v4.s[0]\n"
- "fmla v23.4s, v11.4s, v5.s[0]\n"
- "fmla v27.4s, v11.4s, v6.s[0]\n"
- "fmla v31.4s, v11.4s, v7.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v4.s[1]\n"
- "fmla v20.4s, v8.4s, v5.s[1]\n"
- "fmla v24.4s, v8.4s, v6.s[1]\n"
- "fmla v28.4s, v8.4s, v7.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v4.s[1]\n"
- "fmla v21.4s, v9.4s, v5.s[1]\n"
- "fmla v25.4s, v9.4s, v6.s[1]\n"
- "fmla v29.4s, v9.4s, v7.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v4.s[1]\n"
- "fmla v22.4s, v10.4s, v5.s[1]\n"
- "fmla v26.4s, v10.4s, v6.s[1]\n"
- "fmla v30.4s, v10.4s, v7.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[1]\n"
- "fmla v23.4s, v11.4s, v5.s[1]\n"
- "fmla v27.4s, v11.4s, v6.s[1]\n"
- "fmla v31.4s, v11.4s, v7.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v4.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v5.s[2]\n"
- "fmla v24.4s, v8.4s, v6.s[2]\n"
- "fmla v28.4s, v8.4s, v7.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v4.s[2]\n"
- "fmla v21.4s, v9.4s, v5.s[2]\n"
- "fmla v25.4s, v9.4s, v6.s[2]\n"
- "fmla v29.4s, v9.4s, v7.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v4.s[2]\n"
- "fmla v22.4s, v10.4s, v5.s[2]\n"
- "fmla v26.4s, v10.4s, v6.s[2]\n"
- "fmla v30.4s, v10.4s, v7.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v4.s[2]\n"
- "fmla v23.4s, v11.4s, v5.s[2]\n"
- "fmla v27.4s, v11.4s, v6.s[2]\n"
- "fmla v31.4s, v11.4s, v7.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v4.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #-0x40\n"
- "fmla v20.4s, v8.4s, v5.s[3]\n"
- "fmla v24.4s, v8.4s, v6.s[3]\n"
- "fmla v28.4s, v8.4s, v7.s[3]\n"
- "fmla v17.4s, v9.4s, v4.s[3]\n"
- "fmla v21.4s, v9.4s, v5.s[3]\n"
- "fmla v25.4s, v9.4s, v6.s[3]\n"
- "fmla v29.4s, v9.4s, v7.s[3]\n"
- "fmla v18.4s, v10.4s, v4.s[3]\n"
- "fmla v22.4s, v10.4s, v5.s[3]\n"
- "fmla v26.4s, v10.4s, v6.s[3]\n"
- "fmla v30.4s, v10.4s, v7.s[3]\n"
- "fmla v19.4s, v11.4s, v4.s[3]\n"
- "fmla v23.4s, v11.4s, v5.s[3]\n"
- "fmla v27.4s, v11.4s, v6.s[3]\n"
- "fmla v31.4s, v11.4s, v7.s[3]\n"
- "b 5f\n"
- "4:\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "fmla v28.4s, v8.4s, v3.s[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "fmla v29.4s, v9.4s, v3.s[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "fmla v30.4s, v10.4s, v3.s[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "fmla v31.4s, v11.4s, v3.s[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "fmla v16.4s, v8.4s, v0.s[1]\n"
- "fmla v20.4s, v8.4s, v1.s[1]\n"
- "fmla v24.4s, v8.4s, v2.s[1]\n"
- "fmla v28.4s, v8.4s, v3.s[1]\n"
- "ldr q8, [%[b_ptr0], #0x40]\n"
- "fmla v17.4s, v9.4s, v0.s[1]\n"
- "fmla v21.4s, v9.4s, v1.s[1]\n"
- "fmla v25.4s, v9.4s, v2.s[1]\n"
- "fmla v29.4s, v9.4s, v3.s[1]\n"
- "ldr q9, [%[b_ptr0], #0x50]\n"
- "fmla v18.4s, v10.4s, v0.s[1]\n"
- "fmla v22.4s, v10.4s, v1.s[1]\n"
- "fmla v26.4s, v10.4s, v2.s[1]\n"
- "fmla v30.4s, v10.4s, v3.s[1]\n"
- "ldr q10, [%[b_ptr0], #0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[1]\n"
- "fmla v23.4s, v11.4s, v1.s[1]\n"
- "fmla v27.4s, v11.4s, v2.s[1]\n"
- "fmla v31.4s, v11.4s, v3.s[1]\n"
- "ldr q11, [%[b_ptr0], #0x70]\n"
- "fmla v16.4s, v8.4s, v0.s[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- "fmla v20.4s, v8.4s, v1.s[2]\n"
- "fmla v24.4s, v8.4s, v2.s[2]\n"
- "fmla v28.4s, v8.4s, v3.s[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- "fmla v17.4s, v9.4s, v0.s[2]\n"
- "fmla v21.4s, v9.4s, v1.s[2]\n"
- "fmla v25.4s, v9.4s, v2.s[2]\n"
- "fmla v29.4s, v9.4s, v3.s[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- "fmla v18.4s, v10.4s, v0.s[2]\n"
- "fmla v22.4s, v10.4s, v1.s[2]\n"
- "fmla v26.4s, v10.4s, v2.s[2]\n"
- "fmla v30.4s, v10.4s, v3.s[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "fmla v19.4s, v11.4s, v0.s[2]\n"
- "fmla v23.4s, v11.4s, v1.s[2]\n"
- "fmla v27.4s, v11.4s, v2.s[2]\n"
- "fmla v31.4s, v11.4s, v3.s[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "fmla v16.4s, v8.4s, v0.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #-0x40\n"
- "fmla v20.4s, v8.4s, v1.s[3]\n"
- "fmla v24.4s, v8.4s, v2.s[3]\n"
- "fmla v28.4s, v8.4s, v3.s[3]\n"
- "fmla v17.4s, v9.4s, v0.s[3]\n"
- "fmla v21.4s, v9.4s, v1.s[3]\n"
- "fmla v25.4s, v9.4s, v2.s[3]\n"
- "fmla v29.4s, v9.4s, v3.s[3]\n"
- "fmla v18.4s, v10.4s, v0.s[3]\n"
- "fmla v22.4s, v10.4s, v1.s[3]\n"
- "fmla v26.4s, v10.4s, v2.s[3]\n"
- "fmla v30.4s, v10.4s, v3.s[3]\n"
- "fmla v19.4s, v11.4s, v0.s[3]\n"
- "fmla v23.4s, v11.4s, v1.s[3]\n"
- "fmla v27.4s, v11.4s, v2.s[3]\n"
- "fmla v31.4s, v11.4s, v3.s[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v16.4s, v8.4s, v0.s[0]\n"
- "ldr s1, [a_ptr1]\n"
- "fmla v17.4s, v9.4s, v0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v18.4s, v10.4s, v0.s[0]\n"
- "ldr s2, [a_ptr2]\n"
- "fmla v20.4s, v8.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- "fmla v21.4s, v9.4s, v1.s[0]\n"
- "ldr s3, [a_ptr3]\n"
- "fmla v24.4s, v8.4s, v2.s[0]\n"
- "add a_ptr3, a_ptr3, #0x4\n"
- "fmla v25.4s, v9.4s, v2.s[0]\n"
- "fmla v28.4s, v8.4s, v3.s[0]\n"
- "fmla v29.4s, v9.4s, v3.s[0]\n"
- "fmla v22.4s, v10.4s, v1.s[0]\n"
- "fmla v26.4s, v10.4s, v2.s[0]\n"
- "fmla v30.4s, v10.4s, v3.s[0]\n"
- "fmla v19.4s, v11.4s, v0.s[0]\n"
- "fmla v23.4s, v11.4s, v1.s[0]\n"
- "fmla v27.4s, v11.4s, v2.s[0]\n"
- "fmla v31.4s, v11.4s, v3.s[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "ld1r {v14.4s}, [%[minptr]]\n"
- "ld1r {v15.4s}, [%[maxptr]]\n"
- "fmax v16.4s, v16.4s, v14.4s\n"
- "fmax v17.4s, v17.4s, v14.4s\n"
- "fmax v18.4s, v18.4s, v14.4s\n"
- "fmax v19.4s, v19.4s, v14.4s\n"
- "fmin v16.4s, v16.4s, v15.4s\n"
- "fmin v17.4s, v17.4s, v15.4s\n"
- "fmin v18.4s, v18.4s, v15.4s\n"
- "fmin v19.4s, v19.4s, v15.4s\n"
- "str q16, [%[c_ptr0]]\n"
- "fmax v20.4s, v20.4s, v14.4s\n"
- "fmax v21.4s, v21.4s, v14.4s\n"
- "fmax v22.4s, v22.4s, v14.4s\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "fmax v23.4s, v23.4s, v14.4s\n"
- "fmin v20.4s, v20.4s, v15.4s\n"
- "fmin v21.4s, v21.4s, v15.4s\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "fmin v22.4s, v22.4s, v15.4s\n"
- "fmin v23.4s, v23.4s, v15.4s\n"
- "fmax v24.4s, v24.4s, v14.4s\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "fmax v25.4s, v25.4s, v14.4s\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "fmax v26.4s, v26.4s, v14.4s\n"
- "str q20, [c_ptr1]\n"
- "fmin v24.4s, v24.4s, v15.4s\n"
- "fmin v25.4s, v25.4s, v15.4s\n"
- "fmax v27.4s, v27.4s, v14.4s\n"
- "str q21, [c_ptr1, #0x10]\n"
- "fmin v26.4s, v26.4s, v15.4s\n"
- "fmax v28.4s, v28.4s, v14.4s\n"
- "fmax v29.4s, v29.4s, v14.4s\n"
- "str q22, [c_ptr1, #0x20]\n"
- "fmin v27.4s, v27.4s, v15.4s\n"
- "fmax v30.4s, v30.4s, v14.4s\n"
- "fmin v28.4s, v28.4s, v15.4s\n"
- "str q23, [c_ptr1, #0x30]\n"
- "fmin v29.4s, v29.4s, v15.4s\n"
- "fmax v31.4s, v31.4s, v14.4s\n"
- "fmin v30.4s, v30.4s, v15.4s\n"
- "str q24, [c_ptr2]\n"
- "fmin v31.4s, v31.4s, v15.4s\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- "str q28, [c_ptr3]\n"
- "str q29, [c_ptr3, #0x10]\n"
- "str q30, [c_ptr3, #0x20]\n"
- "str q31, [c_ptr3, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- }
- if (use_result_buffer) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
- }
- }
- }
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp
deleted file mode 100644
index 7442d258ec..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_4x8/generic.cpp
+++ /dev/null
@@ -1,1934 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_fp32_mla_4x8(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
- const int K_stride = K;
- const long loops_count = ((K + 4) / 8) - 1;
- K -= loops_count * 8;
- const long regs_count = (K / 4) - 1;
- K -= (regs_count + 1) * 4;
- const long blocks_count = K / 1;
- float nullbias[4];
- if (!accumulate && !bias) {
- memset(nullbias, 0, (4 * sizeof(float)));
- }
- float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
- float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
- const float * const minptr = &minval;
- const float * const maxptr = &maxval;
-
- switch(act.type)
- {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- maxval = static_cast<float>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- minval = 0.0f;
- break;
- }
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const float * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(float);
-
- float *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 8) {
- if (rows_to_compute % 8) {
- rows_to_compute = 8 - 1;
- } else {
- rows_to_compute = 8;
- }
- }
-
- for (int x0=0; x0<N; x0+=4ul) {
- const long width = std::min((unsigned long)N-x0, 4ul);
- long loops = loops_count;
- long regs = regs_count;
- long blocks = blocks_count;
- const float *a_ptr0 = a_ptr0_base;
- const float *b_ptr0 = B + (K_stride * x0);
- const bool use_result_buffer = (width < 4);
- float result_buffer[32];
- const unsigned long ldcb = (use_result_buffer ? 4 : ldc) * sizeof(float);
- float *c_ptr_real = c_ptr0;
- if (use_result_buffer && accumulate) {
- for(int cy=0; cy<std::min(M-y, 8); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- result_buffer[cy * 4 + cx] = c_ptr_real[cy * ldc + cx];
- }
- }
- }
- if (use_result_buffer) {
- c_ptr0 = result_buffer;
- }
- const float *biasptr = bias ? bias+x0 : nullbias;
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "ldr q24, [%[biasptr]]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q16, [%[b_ptr0]]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q16, [%[b_ptr0]]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "ldr q16, [%[b_ptr0], #0x40]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x50]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "cbz %[regs], 3f\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "b 4f\n"
- "3:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "4:\n"
- "cbz %[blocks], 5f\n"
- "6:\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
- "ldr s0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "b.ne 6b\n"
- "5:\n"
- "ld1r {v22.4s}, [%[minptr]]\n"
- "ld1r {v23.4s}, [%[maxptr]]\n"
- "fmax v24.4s, v24.4s, v22.4s\n"
- "fmin v24.4s, v24.4s, v23.4s\n"
- "str q24, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "ldr q24, [%[biasptr]]\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "mov v25.16b, v24.16b\n"
- "ldr q1, [a_ptr1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "ldr q9, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "ldr q16, [%[b_ptr0], #0x40]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x50]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "cbz %[regs], 3f\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "b 4f\n"
- "3:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "4:\n"
- "cbz %[blocks], 5f\n"
- "6:\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
- "ldr s0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr s1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "b.ne 6b\n"
- "5:\n"
- "ld1r {v22.4s}, [%[minptr]]\n"
- "ld1r {v23.4s}, [%[maxptr]]\n"
- "fmax v24.4s, v24.4s, v22.4s\n"
- "fmax v25.4s, v25.4s, v22.4s\n"
- "fmin v24.4s, v24.4s, v23.4s\n"
- "fmin v25.4s, v25.4s, v23.4s\n"
- "str q24, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
- "str q25, [c_ptr1]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "ldr q24, [%[biasptr]]\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "mov v25.16b, v24.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v26.16b, v24.16b\n"
- "ldr q2, [a_ptr2]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "ldr q10, [a_ptr2]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v26.4s, v16.4s, v10.s[0]\n"
- "ldr q16, [%[b_ptr0], #0x40]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "fmla v26.4s, v17.4s, v10.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x50]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v26.4s, v18.4s, v10.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x60]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "fmla v26.4s, v19.4s, v10.s[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "cbz %[regs], 3f\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "ldr q10, [a_ptr2]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "fmla v26.4s, v16.4s, v10.s[0]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "fmla v26.4s, v17.4s, v10.s[1]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v26.4s, v18.4s, v10.s[2]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "fmla v26.4s, v19.4s, v10.s[3]\n"
- "b 4f\n"
- "3:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "4:\n"
- "cbz %[blocks], 5f\n"
- "6:\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
- "ldr s0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr s1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr s2, [a_ptr2]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "b.ne 6b\n"
- "5:\n"
- "ld1r {v22.4s}, [%[minptr]]\n"
- "ld1r {v23.4s}, [%[maxptr]]\n"
- "fmax v24.4s, v24.4s, v22.4s\n"
- "fmax v25.4s, v25.4s, v22.4s\n"
- "fmax v26.4s, v26.4s, v22.4s\n"
- "fmin v24.4s, v24.4s, v23.4s\n"
- "fmin v25.4s, v25.4s, v23.4s\n"
- "fmin v26.4s, v26.4s, v23.4s\n"
- "str q24, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
- "str q25, [c_ptr1]\n"
- "str q26, [c_ptr2]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "ldr q24, [%[biasptr]]\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "mov v25.16b, v24.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v26.16b, v24.16b\n"
- "ldr q2, [a_ptr2]\n"
- "mov v27.16b, v24.16b\n"
- "ldr q16, [%[b_ptr0]]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "ldr q3, [a_ptr3]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "ldr q10, [a_ptr2]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q11, [a_ptr3]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- "fmla v26.4s, v16.4s, v10.s[0]\n"
- "ldr q3, [a_ptr3, #-0x10]\n"
- "fmla v27.4s, v16.4s, v11.s[0]\n"
- "ldr q16, [%[b_ptr0], #0x40]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- "fmla v26.4s, v17.4s, v10.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- "fmla v27.4s, v17.4s, v11.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x50]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v26.4s, v18.4s, v10.s[2]\n"
- "fmla v27.4s, v18.4s, v11.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x60]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "fmla v26.4s, v19.4s, v10.s[3]\n"
- "fmla v27.4s, v19.4s, v11.s[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "cbz %[regs], 3f\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "ldr q10, [a_ptr2]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "ldr q11, [a_ptr3]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "fmla v26.4s, v16.4s, v10.s[0]\n"
- "fmla v27.4s, v16.4s, v11.s[0]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "fmla v26.4s, v17.4s, v10.s[1]\n"
- "fmla v27.4s, v17.4s, v11.s[1]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v26.4s, v18.4s, v10.s[2]\n"
- "fmla v27.4s, v18.4s, v11.s[2]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "fmla v26.4s, v19.4s, v10.s[3]\n"
- "fmla v27.4s, v19.4s, v11.s[3]\n"
- "b 4f\n"
- "3:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "4:\n"
- "cbz %[blocks], 5f\n"
- "6:\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
- "ldr s0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr s1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr s2, [a_ptr2]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- "ldr s3, [a_ptr3]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "add a_ptr3, a_ptr3, #0x4\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "b.ne 6b\n"
- "5:\n"
- "ld1r {v22.4s}, [%[minptr]]\n"
- "ld1r {v23.4s}, [%[maxptr]]\n"
- "fmax v24.4s, v24.4s, v22.4s\n"
- "fmax v25.4s, v25.4s, v22.4s\n"
- "fmax v26.4s, v26.4s, v22.4s\n"
- "fmax v27.4s, v27.4s, v22.4s\n"
- "fmin v24.4s, v24.4s, v23.4s\n"
- "fmin v25.4s, v25.4s, v23.4s\n"
- "fmin v26.4s, v26.4s, v23.4s\n"
- "fmin v27.4s, v27.4s, v23.4s\n"
- "str q24, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
- "str q25, [c_ptr1]\n"
- "str q26, [c_ptr2]\n"
- "str q27, [c_ptr3]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- case 5:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "c_ptr1 .req X4\n"
- "c_ptr2 .req X5\n"
- "c_ptr3 .req X6\n"
- "c_ptr4 .req X7\n"
- "ldr q24, [%[biasptr]]\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "mov v25.16b, v24.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v26.16b, v24.16b\n"
- "ldr q2, [a_ptr2]\n"
- "mov v27.16b, v24.16b\n"
- "ldr q16, [%[b_ptr0]]\n"
- "mov v28.16b, v24.16b\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "ldr q3, [a_ptr3]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "ldr q4, [a_ptr4]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "ldr q10, [a_ptr2]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "ldr q11, [a_ptr3]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q12, [a_ptr4]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v28.4s, v17.4s, v4.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- "fmla v28.4s, v18.4s, v4.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "ldr q3, [a_ptr3, #-0x10]\n"
- "fmla v28.4s, v19.4s, v4.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "add a_ptr4, a_ptr4, #0x20\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "ldr q4, [a_ptr4, #-0x10]\n"
- "fmla v26.4s, v16.4s, v10.s[0]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v27.4s, v16.4s, v11.s[0]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- "fmla v28.4s, v16.4s, v12.s[0]\n"
- "ldr q16, [%[b_ptr0], #0x40]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "fmla v26.4s, v17.4s, v10.s[1]\n"
- "fmla v27.4s, v17.4s, v11.s[1]\n"
- "fmla v28.4s, v17.4s, v12.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x50]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v26.4s, v18.4s, v10.s[2]\n"
- "fmla v27.4s, v18.4s, v11.s[2]\n"
- "fmla v28.4s, v18.4s, v12.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x60]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "fmla v26.4s, v19.4s, v10.s[3]\n"
- "fmla v27.4s, v19.4s, v11.s[3]\n"
- "fmla v28.4s, v19.4s, v12.s[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "prfm PSTL1KEEP, [c_ptr4]\n"
- "cbz %[regs], 3f\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "ldr q10, [a_ptr2]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "ldr q11, [a_ptr3]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "ldr q12, [a_ptr4]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "fmla v28.4s, v17.4s, v4.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "fmla v28.4s, v18.4s, v4.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "fmla v28.4s, v19.4s, v4.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "fmla v26.4s, v16.4s, v10.s[0]\n"
- "fmla v27.4s, v16.4s, v11.s[0]\n"
- "fmla v28.4s, v16.4s, v12.s[0]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "fmla v26.4s, v17.4s, v10.s[1]\n"
- "fmla v27.4s, v17.4s, v11.s[1]\n"
- "fmla v28.4s, v17.4s, v12.s[1]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v26.4s, v18.4s, v10.s[2]\n"
- "fmla v27.4s, v18.4s, v11.s[2]\n"
- "fmla v28.4s, v18.4s, v12.s[2]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "fmla v26.4s, v19.4s, v10.s[3]\n"
- "fmla v27.4s, v19.4s, v11.s[3]\n"
- "fmla v28.4s, v19.4s, v12.s[3]\n"
- "b 4f\n"
- "3:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "fmla v28.4s, v17.4s, v4.s[1]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "fmla v28.4s, v18.4s, v4.s[2]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "fmla v28.4s, v19.4s, v4.s[3]\n"
- "4:\n"
- "cbz %[blocks], 5f\n"
- "6:\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
- "ldr s0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr s1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr s2, [a_ptr2]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- "ldr s3, [a_ptr3]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "add a_ptr3, a_ptr3, #0x4\n"
- "ldr s4, [a_ptr4]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "add a_ptr4, a_ptr4, #0x4\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "b.ne 6b\n"
- "5:\n"
- "ld1r {v22.4s}, [%[minptr]]\n"
- "ld1r {v23.4s}, [%[maxptr]]\n"
- "fmax v24.4s, v24.4s, v22.4s\n"
- "fmax v25.4s, v25.4s, v22.4s\n"
- "fmax v26.4s, v26.4s, v22.4s\n"
- "fmax v27.4s, v27.4s, v22.4s\n"
- "fmin v24.4s, v24.4s, v23.4s\n"
- "fmin v25.4s, v25.4s, v23.4s\n"
- "fmin v26.4s, v26.4s, v23.4s\n"
- "fmin v27.4s, v27.4s, v23.4s\n"
- "str q24, [%[c_ptr0]]\n"
- "fmax v28.4s, v28.4s, v22.4s\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
- "str q25, [c_ptr1]\n"
- "fmin v28.4s, v28.4s, v23.4s\n"
- "str q26, [c_ptr2]\n"
- "str q27, [c_ptr3]\n"
- "str q28, [c_ptr4]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
- );
- break;
- case 6:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "c_ptr1 .req X5\n"
- "c_ptr2 .req X6\n"
- "c_ptr3 .req X7\n"
- "c_ptr4 .req X8\n"
- "c_ptr5 .req X9\n"
- "ldr q24, [%[biasptr]]\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "mov v25.16b, v24.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v26.16b, v24.16b\n"
- "ldr q2, [a_ptr2]\n"
- "mov v27.16b, v24.16b\n"
- "ldr q16, [%[b_ptr0]]\n"
- "mov v28.16b, v24.16b\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "mov v29.16b, v24.16b\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "ldr q3, [a_ptr3]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "ldr q4, [a_ptr4]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "ldr q5, [a_ptr5]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "ldr q10, [a_ptr2]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "ldr q11, [a_ptr3]\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "ldr q12, [a_ptr4]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q13, [a_ptr5]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v28.4s, v17.4s, v4.s[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v29.4s, v17.4s, v5.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "add a_ptr4, a_ptr4, #0x20\n"
- "fmla v28.4s, v18.4s, v4.s[2]\n"
- "add a_ptr5, a_ptr5, #0x20\n"
- "fmla v29.4s, v18.4s, v5.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "ldr q3, [a_ptr3, #-0x10]\n"
- "fmla v28.4s, v19.4s, v4.s[3]\n"
- "ldr q4, [a_ptr4, #-0x10]\n"
- "fmla v29.4s, v19.4s, v5.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "ldr q5, [a_ptr5, #-0x10]\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v26.4s, v16.4s, v10.s[0]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- "fmla v27.4s, v16.4s, v11.s[0]\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- "fmla v28.4s, v16.4s, v12.s[0]\n"
- "fmla v29.4s, v16.4s, v13.s[0]\n"
- "ldr q16, [%[b_ptr0], #0x40]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "fmla v26.4s, v17.4s, v10.s[1]\n"
- "fmla v27.4s, v17.4s, v11.s[1]\n"
- "fmla v28.4s, v17.4s, v12.s[1]\n"
- "fmla v29.4s, v17.4s, v13.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x50]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v26.4s, v18.4s, v10.s[2]\n"
- "fmla v27.4s, v18.4s, v11.s[2]\n"
- "fmla v28.4s, v18.4s, v12.s[2]\n"
- "fmla v29.4s, v18.4s, v13.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x60]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "fmla v26.4s, v19.4s, v10.s[3]\n"
- "fmla v27.4s, v19.4s, v11.s[3]\n"
- "fmla v28.4s, v19.4s, v12.s[3]\n"
- "fmla v29.4s, v19.4s, v13.s[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "prfm PSTL1KEEP, [c_ptr4]\n"
- "prfm PSTL1KEEP, [c_ptr5]\n"
- "cbz %[regs], 3f\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "ldr q10, [a_ptr2]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "ldr q11, [a_ptr3]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "ldr q12, [a_ptr4]\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "ldr q13, [a_ptr5]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "fmla v28.4s, v17.4s, v4.s[1]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "fmla v29.4s, v17.4s, v5.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "fmla v28.4s, v18.4s, v4.s[2]\n"
- "fmla v29.4s, v18.4s, v5.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "fmla v28.4s, v19.4s, v4.s[3]\n"
- "fmla v29.4s, v19.4s, v5.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "fmla v26.4s, v16.4s, v10.s[0]\n"
- "fmla v27.4s, v16.4s, v11.s[0]\n"
- "fmla v28.4s, v16.4s, v12.s[0]\n"
- "fmla v29.4s, v16.4s, v13.s[0]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "fmla v26.4s, v17.4s, v10.s[1]\n"
- "fmla v27.4s, v17.4s, v11.s[1]\n"
- "fmla v28.4s, v17.4s, v12.s[1]\n"
- "fmla v29.4s, v17.4s, v13.s[1]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v26.4s, v18.4s, v10.s[2]\n"
- "fmla v27.4s, v18.4s, v11.s[2]\n"
- "fmla v28.4s, v18.4s, v12.s[2]\n"
- "fmla v29.4s, v18.4s, v13.s[2]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "fmla v26.4s, v19.4s, v10.s[3]\n"
- "fmla v27.4s, v19.4s, v11.s[3]\n"
- "fmla v28.4s, v19.4s, v12.s[3]\n"
- "fmla v29.4s, v19.4s, v13.s[3]\n"
- "b 4f\n"
- "3:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "fmla v28.4s, v17.4s, v4.s[1]\n"
- "fmla v29.4s, v17.4s, v5.s[1]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "fmla v28.4s, v18.4s, v4.s[2]\n"
- "fmla v29.4s, v18.4s, v5.s[2]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "fmla v28.4s, v19.4s, v4.s[3]\n"
- "fmla v29.4s, v19.4s, v5.s[3]\n"
- "4:\n"
- "cbz %[blocks], 5f\n"
- "6:\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
- "ldr s0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr s1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr s2, [a_ptr2]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- "ldr s3, [a_ptr3]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "add a_ptr3, a_ptr3, #0x4\n"
- "ldr s4, [a_ptr4]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "add a_ptr4, a_ptr4, #0x4\n"
- "ldr s5, [a_ptr5]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "add a_ptr5, a_ptr5, #0x4\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "b.ne 6b\n"
- "5:\n"
- "ld1r {v22.4s}, [%[minptr]]\n"
- "ld1r {v23.4s}, [%[maxptr]]\n"
- "fmax v24.4s, v24.4s, v22.4s\n"
- "fmax v25.4s, v25.4s, v22.4s\n"
- "fmax v26.4s, v26.4s, v22.4s\n"
- "fmax v27.4s, v27.4s, v22.4s\n"
- "fmin v24.4s, v24.4s, v23.4s\n"
- "fmin v25.4s, v25.4s, v23.4s\n"
- "fmin v26.4s, v26.4s, v23.4s\n"
- "fmin v27.4s, v27.4s, v23.4s\n"
- "str q24, [%[c_ptr0]]\n"
- "fmax v28.4s, v28.4s, v22.4s\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
- "fmax v29.4s, v29.4s, v22.4s\n"
- "str q25, [c_ptr1]\n"
- "fmin v28.4s, v28.4s, v23.4s\n"
- "fmin v29.4s, v29.4s, v23.4s\n"
- "str q26, [c_ptr2]\n"
- "str q27, [c_ptr3]\n"
- "str q28, [c_ptr4]\n"
- "str q29, [c_ptr5]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
- );
- break;
- case 7:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "c_ptr1 .req X6\n"
- "c_ptr2 .req X7\n"
- "c_ptr3 .req X8\n"
- "c_ptr4 .req X9\n"
- "c_ptr5 .req X10\n"
- "c_ptr6 .req X11\n"
- "ldr q24, [%[biasptr]]\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "mov v25.16b, v24.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v26.16b, v24.16b\n"
- "ldr q2, [a_ptr2]\n"
- "mov v27.16b, v24.16b\n"
- "ldr q16, [%[b_ptr0]]\n"
- "mov v28.16b, v24.16b\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "mov v29.16b, v24.16b\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "mov v30.16b, v24.16b\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "ldr q3, [a_ptr3]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "ldr q4, [a_ptr4]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "ldr q5, [a_ptr5]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "ldr q6, [a_ptr6]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "add a_ptr6, a_ptr6, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "ldr q10, [a_ptr2]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "ldr q11, [a_ptr3]\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "ldr q12, [a_ptr4]\n"
- "fmla v30.4s, v16.4s, v6.s[0]\n"
- "ldr q13, [a_ptr5]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q14, [a_ptr6]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v28.4s, v17.4s, v4.s[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v29.4s, v17.4s, v5.s[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v30.4s, v17.4s, v6.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "add a_ptr4, a_ptr4, #0x20\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "add a_ptr5, a_ptr5, #0x20\n"
- "fmla v28.4s, v18.4s, v4.s[2]\n"
- "add a_ptr6, a_ptr6, #0x20\n"
- "fmla v29.4s, v18.4s, v5.s[2]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v30.4s, v18.4s, v6.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "ldr q3, [a_ptr3, #-0x10]\n"
- "fmla v28.4s, v19.4s, v4.s[3]\n"
- "ldr q4, [a_ptr4, #-0x10]\n"
- "fmla v29.4s, v19.4s, v5.s[3]\n"
- "ldr q5, [a_ptr5, #-0x10]\n"
- "fmla v30.4s, v19.4s, v6.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "ldr q6, [a_ptr6, #-0x10]\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- "fmla v26.4s, v16.4s, v10.s[0]\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- "fmla v27.4s, v16.4s, v11.s[0]\n"
- "fmla v28.4s, v16.4s, v12.s[0]\n"
- "fmla v29.4s, v16.4s, v13.s[0]\n"
- "fmla v30.4s, v16.4s, v14.s[0]\n"
- "ldr q16, [%[b_ptr0], #0x40]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "fmla v26.4s, v17.4s, v10.s[1]\n"
- "fmla v27.4s, v17.4s, v11.s[1]\n"
- "fmla v28.4s, v17.4s, v12.s[1]\n"
- "fmla v29.4s, v17.4s, v13.s[1]\n"
- "fmla v30.4s, v17.4s, v14.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x50]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v26.4s, v18.4s, v10.s[2]\n"
- "fmla v27.4s, v18.4s, v11.s[2]\n"
- "fmla v28.4s, v18.4s, v12.s[2]\n"
- "fmla v29.4s, v18.4s, v13.s[2]\n"
- "fmla v30.4s, v18.4s, v14.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x60]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "fmla v26.4s, v19.4s, v10.s[3]\n"
- "fmla v27.4s, v19.4s, v11.s[3]\n"
- "fmla v28.4s, v19.4s, v12.s[3]\n"
- "fmla v29.4s, v19.4s, v13.s[3]\n"
- "fmla v30.4s, v19.4s, v14.s[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "prfm PSTL1KEEP, [c_ptr4]\n"
- "prfm PSTL1KEEP, [c_ptr5]\n"
- "prfm PSTL1KEEP, [c_ptr6]\n"
- "cbz %[regs], 3f\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "ldr q10, [a_ptr2]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "ldr q11, [a_ptr3]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "ldr q12, [a_ptr4]\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "ldr q13, [a_ptr5]\n"
- "fmla v30.4s, v16.4s, v6.s[0]\n"
- "ldr q14, [a_ptr6]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "fmla v28.4s, v17.4s, v4.s[1]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "fmla v29.4s, v17.4s, v5.s[1]\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "fmla v30.4s, v17.4s, v6.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "add a_ptr6, a_ptr6, #0x10\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "fmla v28.4s, v18.4s, v4.s[2]\n"
- "fmla v29.4s, v18.4s, v5.s[2]\n"
- "fmla v30.4s, v18.4s, v6.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "fmla v28.4s, v19.4s, v4.s[3]\n"
- "fmla v29.4s, v19.4s, v5.s[3]\n"
- "fmla v30.4s, v19.4s, v6.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "fmla v26.4s, v16.4s, v10.s[0]\n"
- "fmla v27.4s, v16.4s, v11.s[0]\n"
- "fmla v28.4s, v16.4s, v12.s[0]\n"
- "fmla v29.4s, v16.4s, v13.s[0]\n"
- "fmla v30.4s, v16.4s, v14.s[0]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "fmla v26.4s, v17.4s, v10.s[1]\n"
- "fmla v27.4s, v17.4s, v11.s[1]\n"
- "fmla v28.4s, v17.4s, v12.s[1]\n"
- "fmla v29.4s, v17.4s, v13.s[1]\n"
- "fmla v30.4s, v17.4s, v14.s[1]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v26.4s, v18.4s, v10.s[2]\n"
- "fmla v27.4s, v18.4s, v11.s[2]\n"
- "fmla v28.4s, v18.4s, v12.s[2]\n"
- "fmla v29.4s, v18.4s, v13.s[2]\n"
- "fmla v30.4s, v18.4s, v14.s[2]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "fmla v26.4s, v19.4s, v10.s[3]\n"
- "fmla v27.4s, v19.4s, v11.s[3]\n"
- "fmla v28.4s, v19.4s, v12.s[3]\n"
- "fmla v29.4s, v19.4s, v13.s[3]\n"
- "fmla v30.4s, v19.4s, v14.s[3]\n"
- "b 4f\n"
- "3:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "fmla v30.4s, v16.4s, v6.s[0]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "fmla v28.4s, v17.4s, v4.s[1]\n"
- "fmla v29.4s, v17.4s, v5.s[1]\n"
- "fmla v30.4s, v17.4s, v6.s[1]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "fmla v28.4s, v18.4s, v4.s[2]\n"
- "fmla v29.4s, v18.4s, v5.s[2]\n"
- "fmla v30.4s, v18.4s, v6.s[2]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "fmla v28.4s, v19.4s, v4.s[3]\n"
- "fmla v29.4s, v19.4s, v5.s[3]\n"
- "fmla v30.4s, v19.4s, v6.s[3]\n"
- "4:\n"
- "cbz %[blocks], 5f\n"
- "6:\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
- "ldr s0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr s1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr s2, [a_ptr2]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- "ldr s3, [a_ptr3]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "add a_ptr3, a_ptr3, #0x4\n"
- "ldr s4, [a_ptr4]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "add a_ptr4, a_ptr4, #0x4\n"
- "ldr s5, [a_ptr5]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "add a_ptr5, a_ptr5, #0x4\n"
- "ldr s6, [a_ptr6]\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "add a_ptr6, a_ptr6, #0x4\n"
- "fmla v30.4s, v16.4s, v6.s[0]\n"
- "b.ne 6b\n"
- "5:\n"
- "ld1r {v22.4s}, [%[minptr]]\n"
- "ld1r {v23.4s}, [%[maxptr]]\n"
- "fmax v24.4s, v24.4s, v22.4s\n"
- "fmax v25.4s, v25.4s, v22.4s\n"
- "fmax v26.4s, v26.4s, v22.4s\n"
- "fmax v27.4s, v27.4s, v22.4s\n"
- "fmin v24.4s, v24.4s, v23.4s\n"
- "fmin v25.4s, v25.4s, v23.4s\n"
- "fmin v26.4s, v26.4s, v23.4s\n"
- "fmin v27.4s, v27.4s, v23.4s\n"
- "str q24, [%[c_ptr0]]\n"
- "fmax v28.4s, v28.4s, v22.4s\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
- "fmax v29.4s, v29.4s, v22.4s\n"
- "str q25, [c_ptr1]\n"
- "fmax v30.4s, v30.4s, v22.4s\n"
- "fmin v28.4s, v28.4s, v23.4s\n"
- "fmin v29.4s, v29.4s, v23.4s\n"
- "str q26, [c_ptr2]\n"
- "fmin v30.4s, v30.4s, v23.4s\n"
- "str q27, [c_ptr3]\n"
- "str q28, [c_ptr4]\n"
- "str q29, [c_ptr5]\n"
- "str q30, [c_ptr6]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "cc", "memory"
- );
- break;
- default:
- case 8:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "ldr q24, [%[biasptr]]\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "mov v25.16b, v24.16b\n"
- "ldr q1, [a_ptr1]\n"
- "mov v26.16b, v24.16b\n"
- "ldr q2, [a_ptr2]\n"
- "mov v27.16b, v24.16b\n"
- "ldr q16, [%[b_ptr0]]\n"
- "mov v28.16b, v24.16b\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "mov v29.16b, v24.16b\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "mov v30.16b, v24.16b\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "mov v31.16b, v24.16b\n"
- "ldr q3, [a_ptr3]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "ldr q4, [a_ptr4]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "ldr q5, [a_ptr5]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "ldr q6, [a_ptr6]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "ldr q7, [a_ptr7]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "add a_ptr6, a_ptr6, #0x10\n"
- "add a_ptr7, a_ptr7, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "ldr q10, [a_ptr2]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "ldr q11, [a_ptr3]\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "ldr q12, [a_ptr4]\n"
- "fmla v30.4s, v16.4s, v6.s[0]\n"
- "ldr q13, [a_ptr5]\n"
- "fmla v31.4s, v16.4s, v7.s[0]\n"
- "ldr q14, [a_ptr6]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q15, [a_ptr7]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "fmla v28.4s, v17.4s, v4.s[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla v29.4s, v17.4s, v5.s[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla v30.4s, v17.4s, v6.s[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla v31.4s, v17.4s, v7.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "add a_ptr4, a_ptr4, #0x20\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "add a_ptr5, a_ptr5, #0x20\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "add a_ptr6, a_ptr6, #0x20\n"
- "fmla v28.4s, v18.4s, v4.s[2]\n"
- "add a_ptr7, a_ptr7, #0x20\n"
- "fmla v29.4s, v18.4s, v5.s[2]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "fmla v30.4s, v18.4s, v6.s[2]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- "fmla v31.4s, v18.4s, v7.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "ldr q3, [a_ptr3, #-0x10]\n"
- "fmla v28.4s, v19.4s, v4.s[3]\n"
- "ldr q4, [a_ptr4, #-0x10]\n"
- "fmla v29.4s, v19.4s, v5.s[3]\n"
- "ldr q5, [a_ptr5, #-0x10]\n"
- "fmla v30.4s, v19.4s, v6.s[3]\n"
- "ldr q6, [a_ptr6, #-0x10]\n"
- "fmla v31.4s, v19.4s, v7.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "ldr q7, [a_ptr7, #-0x10]\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- "fmla v26.4s, v16.4s, v10.s[0]\n"
- "fmla v27.4s, v16.4s, v11.s[0]\n"
- "fmla v28.4s, v16.4s, v12.s[0]\n"
- "fmla v29.4s, v16.4s, v13.s[0]\n"
- "fmla v30.4s, v16.4s, v14.s[0]\n"
- "fmla v31.4s, v16.4s, v15.s[0]\n"
- "ldr q16, [%[b_ptr0], #0x40]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "fmla v26.4s, v17.4s, v10.s[1]\n"
- "fmla v27.4s, v17.4s, v11.s[1]\n"
- "fmla v28.4s, v17.4s, v12.s[1]\n"
- "fmla v29.4s, v17.4s, v13.s[1]\n"
- "fmla v30.4s, v17.4s, v14.s[1]\n"
- "fmla v31.4s, v17.4s, v15.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x50]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v26.4s, v18.4s, v10.s[2]\n"
- "fmla v27.4s, v18.4s, v11.s[2]\n"
- "fmla v28.4s, v18.4s, v12.s[2]\n"
- "fmla v29.4s, v18.4s, v13.s[2]\n"
- "fmla v30.4s, v18.4s, v14.s[2]\n"
- "fmla v31.4s, v18.4s, v15.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x60]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "fmla v26.4s, v19.4s, v10.s[3]\n"
- "fmla v27.4s, v19.4s, v11.s[3]\n"
- "fmla v28.4s, v19.4s, v12.s[3]\n"
- "fmla v29.4s, v19.4s, v13.s[3]\n"
- "fmla v30.4s, v19.4s, v14.s[3]\n"
- "fmla v31.4s, v19.4s, v15.s[3]\n"
- "b.ne 2b\n"
- "1:\n"
- "ldr q19, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "prfm PSTL1KEEP, [c_ptr4]\n"
- "prfm PSTL1KEEP, [c_ptr5]\n"
- "prfm PSTL1KEEP, [c_ptr6]\n"
- "prfm PSTL1KEEP, [c_ptr7]\n"
- "cbz %[regs], 3f\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr q8, [%[a_ptr0]]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "ldr q9, [a_ptr1]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "ldr q10, [a_ptr2]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "ldr q11, [a_ptr3]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "ldr q12, [a_ptr4]\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "ldr q13, [a_ptr5]\n"
- "fmla v30.4s, v16.4s, v6.s[0]\n"
- "ldr q14, [a_ptr6]\n"
- "fmla v31.4s, v16.4s, v7.s[0]\n"
- "ldr q15, [a_ptr7]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "fmla v28.4s, v17.4s, v4.s[1]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "fmla v29.4s, v17.4s, v5.s[1]\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "fmla v30.4s, v17.4s, v6.s[1]\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "fmla v31.4s, v17.4s, v7.s[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "add a_ptr6, a_ptr6, #0x10\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "add a_ptr7, a_ptr7, #0x10\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "fmla v28.4s, v18.4s, v4.s[2]\n"
- "fmla v29.4s, v18.4s, v5.s[2]\n"
- "fmla v30.4s, v18.4s, v6.s[2]\n"
- "fmla v31.4s, v18.4s, v7.s[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "fmla v28.4s, v19.4s, v4.s[3]\n"
- "fmla v29.4s, v19.4s, v5.s[3]\n"
- "fmla v30.4s, v19.4s, v6.s[3]\n"
- "fmla v31.4s, v19.4s, v7.s[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "fmla v24.4s, v16.4s, v8.s[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "fmla v25.4s, v16.4s, v9.s[0]\n"
- "fmla v26.4s, v16.4s, v10.s[0]\n"
- "fmla v27.4s, v16.4s, v11.s[0]\n"
- "fmla v28.4s, v16.4s, v12.s[0]\n"
- "fmla v29.4s, v16.4s, v13.s[0]\n"
- "fmla v30.4s, v16.4s, v14.s[0]\n"
- "fmla v31.4s, v16.4s, v15.s[0]\n"
- "fmla v24.4s, v17.4s, v8.s[1]\n"
- "fmla v25.4s, v17.4s, v9.s[1]\n"
- "fmla v26.4s, v17.4s, v10.s[1]\n"
- "fmla v27.4s, v17.4s, v11.s[1]\n"
- "fmla v28.4s, v17.4s, v12.s[1]\n"
- "fmla v29.4s, v17.4s, v13.s[1]\n"
- "fmla v30.4s, v17.4s, v14.s[1]\n"
- "fmla v31.4s, v17.4s, v15.s[1]\n"
- "fmla v24.4s, v18.4s, v8.s[2]\n"
- "fmla v25.4s, v18.4s, v9.s[2]\n"
- "fmla v26.4s, v18.4s, v10.s[2]\n"
- "fmla v27.4s, v18.4s, v11.s[2]\n"
- "fmla v28.4s, v18.4s, v12.s[2]\n"
- "fmla v29.4s, v18.4s, v13.s[2]\n"
- "fmla v30.4s, v18.4s, v14.s[2]\n"
- "fmla v31.4s, v18.4s, v15.s[2]\n"
- "fmla v24.4s, v19.4s, v8.s[3]\n"
- "fmla v25.4s, v19.4s, v9.s[3]\n"
- "fmla v26.4s, v19.4s, v10.s[3]\n"
- "fmla v27.4s, v19.4s, v11.s[3]\n"
- "fmla v28.4s, v19.4s, v12.s[3]\n"
- "fmla v29.4s, v19.4s, v13.s[3]\n"
- "fmla v30.4s, v19.4s, v14.s[3]\n"
- "fmla v31.4s, v19.4s, v15.s[3]\n"
- "b 4f\n"
- "3:\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "fmla v30.4s, v16.4s, v6.s[0]\n"
- "fmla v31.4s, v16.4s, v7.s[0]\n"
- "fmla v24.4s, v17.4s, v0.s[1]\n"
- "fmla v25.4s, v17.4s, v1.s[1]\n"
- "fmla v26.4s, v17.4s, v2.s[1]\n"
- "fmla v27.4s, v17.4s, v3.s[1]\n"
- "fmla v28.4s, v17.4s, v4.s[1]\n"
- "fmla v29.4s, v17.4s, v5.s[1]\n"
- "fmla v30.4s, v17.4s, v6.s[1]\n"
- "fmla v31.4s, v17.4s, v7.s[1]\n"
- "fmla v24.4s, v18.4s, v0.s[2]\n"
- "fmla v25.4s, v18.4s, v1.s[2]\n"
- "fmla v26.4s, v18.4s, v2.s[2]\n"
- "fmla v27.4s, v18.4s, v3.s[2]\n"
- "fmla v28.4s, v18.4s, v4.s[2]\n"
- "fmla v29.4s, v18.4s, v5.s[2]\n"
- "fmla v30.4s, v18.4s, v6.s[2]\n"
- "fmla v31.4s, v18.4s, v7.s[2]\n"
- "fmla v24.4s, v19.4s, v0.s[3]\n"
- "fmla v25.4s, v19.4s, v1.s[3]\n"
- "fmla v26.4s, v19.4s, v2.s[3]\n"
- "fmla v27.4s, v19.4s, v3.s[3]\n"
- "fmla v28.4s, v19.4s, v4.s[3]\n"
- "fmla v29.4s, v19.4s, v5.s[3]\n"
- "fmla v30.4s, v19.4s, v6.s[3]\n"
- "fmla v31.4s, v19.4s, v7.s[3]\n"
- "4:\n"
- "cbz %[blocks], 5f\n"
- "6:\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
- "ldr s0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr s1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- "fmla v24.4s, v16.4s, v0.s[0]\n"
- "ldr s2, [a_ptr2]\n"
- "fmla v25.4s, v16.4s, v1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- "ldr s3, [a_ptr3]\n"
- "fmla v26.4s, v16.4s, v2.s[0]\n"
- "add a_ptr3, a_ptr3, #0x4\n"
- "ldr s4, [a_ptr4]\n"
- "fmla v27.4s, v16.4s, v3.s[0]\n"
- "add a_ptr4, a_ptr4, #0x4\n"
- "ldr s5, [a_ptr5]\n"
- "fmla v28.4s, v16.4s, v4.s[0]\n"
- "add a_ptr5, a_ptr5, #0x4\n"
- "ldr s6, [a_ptr6]\n"
- "fmla v29.4s, v16.4s, v5.s[0]\n"
- "add a_ptr6, a_ptr6, #0x4\n"
- "ldr s7, [a_ptr7]\n"
- "fmla v30.4s, v16.4s, v6.s[0]\n"
- "add a_ptr7, a_ptr7, #0x4\n"
- "fmla v31.4s, v16.4s, v7.s[0]\n"
- "b.ne 6b\n"
- "5:\n"
- "ld1r {v22.4s}, [%[minptr]]\n"
- "ld1r {v23.4s}, [%[maxptr]]\n"
- "fmax v24.4s, v24.4s, v22.4s\n"
- "fmax v25.4s, v25.4s, v22.4s\n"
- "fmax v26.4s, v26.4s, v22.4s\n"
- "fmax v27.4s, v27.4s, v22.4s\n"
- "fmin v24.4s, v24.4s, v23.4s\n"
- "fmin v25.4s, v25.4s, v23.4s\n"
- "fmin v26.4s, v26.4s, v23.4s\n"
- "fmin v27.4s, v27.4s, v23.4s\n"
- "str q24, [%[c_ptr0]]\n"
- "fmax v28.4s, v28.4s, v22.4s\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
- "fmax v29.4s, v29.4s, v22.4s\n"
- "str q25, [c_ptr1]\n"
- "fmax v30.4s, v30.4s, v22.4s\n"
- "fmin v28.4s, v28.4s, v23.4s\n"
- "fmax v31.4s, v31.4s, v22.4s\n"
- "str q26, [c_ptr2]\n"
- "fmin v29.4s, v29.4s, v23.4s\n"
- "fmin v30.4s, v30.4s, v23.4s\n"
- "fmin v31.4s, v31.4s, v23.4s\n"
- "str q27, [c_ptr3]\n"
- "str q28, [c_ptr4]\n"
- "str q29, [c_ptr5]\n"
- "str q30, [c_ptr6]\n"
- "str q31, [c_ptr7]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc", "memory"
- );
- break;
- }
- if (use_result_buffer) {
- for(int cy=0; cy<std::min(M-y, 8); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- c_ptr_real[cy * ldc + cx] = result_buffer[cy * 4 + cx];
- }
- }
- }
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp
index 4147ab60dc..e0c61e4113 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_16x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,44 +10,49 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
#include "../performance_parameters.hpp"
#include "../std_transforms_fixed.hpp"
+#define ARGLIST /* parameter types shared by the a64_hybrid_fp32_mla_6x16 declaration and the kern_type typedef; #undef'd at the end of this header */ \
+ unsigned int, const unsigned int *, /* num_strings, string_lengths */ \
+ IndirectInputArg<float>, /* A_arg: direct or indirect description of the A operand */ \
+ size_t, size_t, /* M, N */ \
+ const float *, /* B_ptr */ \
+ IndirectOutputArg<float>, /* output_arg: direct or indirect description of the output */ \
+ const float *, Activation, bool /* bias (may be null), act, accumulate */
+
namespace arm_gemm
{
// Actual kernel implementations
-void a64_hybrid_fp32_mla_16x4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
-void a64_hybrid_fp32_mla_16x4_a55(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
-void a64_hybrid_fp32_mla_16x4_x1(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+void a64_hybrid_fp32_mla_6x16( ARGLIST );
-class hybrid_fp32_mla_16x4
+class cls_a64_hybrid_fp32_mla_6x16
{
public:
typedef float operand_type;
typedef float result_type;
- typedef void (*kern_type)(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
static constexpr unsigned int out_height()
{
- return 4;
+ return 6; // rows per kernel block; agrees with the 6 given to StdTransformsFixed in this class and with the asm row dispatch, which takes the full-height path when M >= 6
}
static unsigned int out_width()
@@ -65,47 +70,33 @@ public:
return true;
}
- static constexpr bool supports_bias()
- {
- return true;
- }
-
- static constexpr bool supports_activation()
- {
- return true;
- }
-
static PerformanceParameters get_performance_parameters(const CPUInfo *ci) {
switch (ci->get_cpu_model()) {
case CPUModel::A55r1:
- return { 2.866 };
+ return { 2.00 }; // Per-CPU-model figure for the 6x16 kernel. NOTE(review): presumably a throughput estimate used for kernel selection; the removed 2.866 belonged to a 16x4 variant with a dedicated A55 kernel that this change drops - confirm the new value was re-measured
case CPUModel::A53:
- return { 1.419 };
+ return { 1.43 }; // figure returned when the CPU model is A53
case CPUModel::A73:
- return { 2.551 };
+ return { 2.56 }; // figure returned when the CPU model is A73
default:
- return { 6.25 };
+ return { 6.26 }; // fallback for every CPU model not listed above
}
}
- StdTransformsFixed<operand_type, result_type, 4, 16, 1> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 6, 16, 1> transforms = {};
// Default to the generic kernel
- kern_type kernel=a64_hybrid_fp32_mla_16x4;
+ kern_type kernel=a64_hybrid_fp32_mla_6x16;
- hybrid_fp32_mla_16x4(const CPUInfo *ci)
+ cls_a64_hybrid_fp32_mla_6x16(const CPUInfo *) // CPUInfo is accepted but left unnamed and unused: only the generic kernel is declared in this header, so `kernel` keeps its default initializer
{
- if (ci->get_cpu_model() == CPUModel::A55r1) {
- kernel = a64_hybrid_fp32_mla_16x4_a55;
- } else if (ci->get_cpu_model() == CPUModel::X1) {
- kernel = a64_hybrid_fp32_mla_16x4_x1;
- }
}
};
} // namespace arm_gemm
+#undef ARGLIST
#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp
new file mode 100644
index 0000000000..884e8986c8
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_6x16/generic.cpp
@@ -0,0 +1,3430 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32_mla_6x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs { // Values the asm loads from memory via args_ptr + offsetof_* instead of receiving as register operands (NOTE(review): args_ptr is presumably bound to &ka in the operand list, which is outside this excerpt)
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); // upper clamp; stays +inf unless BoundedReLU overwrites it with act.param1
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); // lower clamp; stays -inf unless ReLU/BoundedReLU set it to 0
+ unsigned int num_strings = {}; // bound of the asm "String loop" (string index is compared against this value)
+ const unsigned int *string_lengths = {}; // per-string length, read as a 32-bit value; multiply loops count it down 4 per main iteration and 1 per odd iteration
+ size_t N = {}; // output column count; the column loop consumes 16 per pass
+ const float *B_ptr = {}; // start of the B operand; the multiply loops read it sequentially and advance past each block
+ size_t output_offset = {}; // indirect output: element offset added to every row pointer; direct output: row stride in elements
+ size_t input_initial_col = {}; // indirect input only: element offset added to the row pointers of the first string (string index 0)
+ size_t input_offset = {}; // indirect input: index of the first row within each string's pointer array; direct input: row stride in elements
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 171f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 137f\n"
+ "beq 103f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 69f\n"
+ "beq 35f\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "cbz x14, 4f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "ldr q9, [x14, #0x10]\n"
+ "ldr q10, [x14, #0x20]\n"
+ "ldr q11, [x14, #0x30]\n"
+ "add x14, x14, #0x40\n"
+ "b 15f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 14f\n"
+ "cmp x16, #0x10\n"
+ "bge 13f\n"
+ "tbz x16, #3, 8f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "tbz x16, #2, 6f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "tbz x16, #1, 5f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "tbz x16, #0, 12f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "b 12f\n"
+ "5:" // Height 1: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 12f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "b 12f\n"
+ "6:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x16, #1, 7f\n"
+ "ldr d10, [x13], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 12f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "b 12f\n"
+ "7:" // Height 1: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 12f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "b 12f\n"
+ "8:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x16, #2, 10f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "tbz x16, #1, 9f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "tbz x16, #0, 12f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "b 12f\n"
+ "9:" // Height 1: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 12f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "b 12f\n"
+ "10:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x16, #1, 11f\n"
+ "ldr d8, [x13], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 12f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "b 12f\n"
+ "11:" // Height 1: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "12:" // Height 1: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "b 15f\n"
+ "13:" // Height 1: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "b 15f\n"
+ "14:" // Height 1: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "15:" // Height 1: setup done
+ "mov x12, #0x0\n"
+ "16:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 17f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "cbnz x12, 18f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "b 18f\n"
+ "17:" // Height 1: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "18:" // Height 1: input setup done
+ "cmp x11, #0x4\n"
+ "blt 21f\n"
+ "cmp x11, #0x8\n"
+ "blt 20f\n"
+ "19:" // Height 1: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "sub x11, x11, #0x4\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "cmp x11, #0x8\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "bge 19b\n"
+ "20:" // Height 1: Multiply loop: Single iteration only
+ "sub x11, x11, #0x4\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "21:" // Height 1: Multiply loop: Main loop skip
+ "cbz x11, 23f\n"
+ "22:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "sub x11, x11, #0x1\n"
+ "add x15, x15, #0x40\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "cbnz x11, 22b\n"
+ "23:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 16b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "tbz %x[flags], #1, 24f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "24:" // Height 1: No activation
+ "cmp x16, #0x10\n"
+ "bge 33f\n"
+ "tbz x16, #3, 28f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "tbz x16, #2, 26f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "tbz x16, #1, 25f\n"
+ "str d11, [x13], #0x8\n"
+ "tbz x16, #0, 32f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "b 32f\n"
+ "25:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 32f\n"
+ "str s11, [x13, #0x0]\n"
+ "b 32f\n"
+ "26:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 27f\n"
+ "str d10, [x13], #0x8\n"
+ "tbz x16, #0, 32f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "b 32f\n"
+ "27:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 32f\n"
+ "str s10, [x13, #0x0]\n"
+ "b 32f\n"
+ "28:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 30f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "tbz x16, #1, 29f\n"
+ "str d9, [x13], #0x8\n"
+ "tbz x16, #0, 32f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "b 32f\n"
+ "29:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 32f\n"
+ "str s9, [x13, #0x0]\n"
+ "b 32f\n"
+ "30:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 31f\n"
+ "str d8, [x13], #0x8\n"
+ "tbz x16, #0, 32f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "b 32f\n"
+ "31:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "32:" // Height 1: Partial direct writeback: Done
+ "b 34f\n"
+ "33:" // Height 1: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "34:" // Height 1: Writeback done
+ "subs x16, x16, #0x10\n"
+ "bgt 3b\n"
+ "b 206f\n"
+ "35:" // Height 2
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 36f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "b 37f\n"
+ "36:" // Height 2: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "37:" // Height 2: Column loop
+ "cbz x14, 38f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v13.16b, v9.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v14.16b, v10.16b\n"
+ "add x14, x14, #0x40\n"
+ "mov v15.16b, v11.16b\n"
+ "b 49f\n"
+ "38:" // Height 2: no bias
+ "tbz %x[flags], #0, 48f\n"
+ "cmp x16, #0x10\n"
+ "bge 47f\n"
+ "tbz x16, #3, 42f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "tbz x16, #2, 40f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "tbz x16, #1, 39f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "tbz x16, #0, 46f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "b 46f\n"
+ "39:" // Height 2: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 46f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "b 46f\n"
+ "40:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x16, #1, 41f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 46f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "b 46f\n"
+ "41:" // Height 2: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 46f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "b 46f\n"
+ "42:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x16, #2, 44f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "tbz x16, #1, 43f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "tbz x16, #0, 46f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "b 46f\n"
+ "43:" // Height 2: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 46f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "b 46f\n"
+ "44:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x16, #1, 45f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 46f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "b 46f\n"
+ "45:" // Height 2: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "46:" // Height 2: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "b 49f\n"
+ "47:" // Height 2: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "b 49f\n"
+ "48:" // Height 2: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "49:" // Height 2: setup done
+ "mov x12, #0x0\n"
+ "50:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 51f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x12, 52f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "b 52f\n"
+ "51:" // Height 2: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #2\n"
+ "52:" // Height 2: input setup done
+ "cmp x11, #0x4\n"
+ "blt 55f\n"
+ "cmp x11, #0x8\n"
+ "blt 54f\n"
+ "53:" // Height 2: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "sub x11, x11, #0x4\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "cmp x11, #0x8\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "bge 53b\n"
+ "54:" // Height 2: Multiply loop: Single iteration only
+ "sub x11, x11, #0x4\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "55:" // Height 2: Multiply loop: Main loop skip
+ "cbz x11, 57f\n"
+ "56:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "sub x11, x11, #0x1\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x15, x15, #0x40\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "cbnz x11, 56b\n"
+ "57:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 50b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "tbz %x[flags], #1, 58f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "58:" // Height 2: No activation
+ "cmp x16, #0x10\n"
+ "bge 67f\n"
+ "tbz x16, #3, 62f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "tbz x16, #2, 60f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "tbz x16, #1, 59f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "tbz x16, #0, 66f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "b 66f\n"
+ "59:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 66f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "b 66f\n"
+ "60:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 61f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "tbz x16, #0, 66f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "b 66f\n"
+ "61:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 66f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "b 66f\n"
+ "62:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 64f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "tbz x16, #1, 63f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "tbz x16, #0, 66f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "b 66f\n"
+ "63:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 66f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "b 66f\n"
+ "64:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 65f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "tbz x16, #0, 66f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "b 66f\n"
+ "65:" // Height 2: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "66:" // Height 2: Partial direct writeback: Done
+ "b 68f\n"
+ "67:" // Height 2: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "68:" // Height 2: Writeback done
+ "subs x16, x16, #0x10\n"
+ "bgt 37b\n"
+ "b 206f\n"
+ "69:" // Height 3
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 70f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "b 71f\n"
+ "70:" // Height 3: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "71:" // Height 3: Column loop
+ "cbz x14, 72f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q10, [x14, #0x20]\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v13.16b, v9.16b\n"
+ "add x14, x14, #0x40\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "b 83f\n"
+ "72:" // Height 3: no bias
+ "tbz %x[flags], #0, 82f\n"
+ "cmp x16, #0x10\n"
+ "bge 81f\n"
+ "tbz x16, #3, 76f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "tbz x16, #2, 74f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "tbz x16, #1, 73f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "tbz x16, #0, 80f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "b 80f\n"
+ "73:" // Height 3: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 80f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "b 80f\n"
+ "74:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x16, #1, 75f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 80f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "b 80f\n"
+ "75:" // Height 3: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 80f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "b 80f\n"
+ "76:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x16, #2, 78f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "tbz x16, #1, 77f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "tbz x16, #0, 80f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "b 80f\n"
+ "77:" // Height 3: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 80f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "b 80f\n"
+ "78:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x16, #1, 79f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 80f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "b 80f\n"
+ "79:" // Height 3: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "80:" // Height 3: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "b 83f\n"
+ "81:" // Height 3: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "b 83f\n"
+ "82:" // Height 3: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "83:" // Height 3: setup done
+ "mov x12, #0x0\n"
+ "84:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 85f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x12, 86f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "b 86f\n"
+ "85:" // Height 3: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "86:" // Height 3: input setup done
+ "cmp x11, #0x4\n"
+ "blt 89f\n"
+ "cmp x11, #0x8\n"
+ "blt 88f\n"
+ "87:" // Height 3: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "sub x11, x11, #0x4\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "cmp x11, #0x8\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "bge 87b\n"
+ "88:" // Height 3: Multiply loop: Single iteration only
+ "sub x11, x11, #0x4\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "89:" // Height 3: Multiply loop: Main loop skip
+ "cbz x11, 91f\n"
+ "90:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "sub x11, x11, #0x1\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x15, x15, #0x40\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "cbnz x11, 90b\n"
+ "91:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 84b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "tbz %x[flags], #1, 92f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "92:" // Height 3: No activation
+ "cmp x16, #0x10\n"
+ "bge 101f\n"
+ "tbz x16, #3, 96f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "tbz x16, #2, 94f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "tbz x16, #1, 93f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "tbz x16, #0, 100f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "b 100f\n"
+ "93:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 100f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "b 100f\n"
+ "94:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 95f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "tbz x16, #0, 100f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "b 100f\n"
+ "95:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 100f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "b 100f\n"
+ "96:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 98f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "tbz x16, #1, 97f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "tbz x16, #0, 100f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "b 100f\n"
+ "97:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 100f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "b 100f\n"
+ "98:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 99f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "tbz x16, #0, 100f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "b 100f\n"
+ "99:" // Height 3: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "100:" // Height 3: Partial direct writeback: Done
+ "b 102f\n"
+ "101:" // Height 3: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "102:" // Height 3: Writeback done
+ "subs x16, x16, #0x10\n"
+ "bgt 71b\n"
+ "b 206f\n"
+ "103:" // Height 4
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 104f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 105f\n"
+ "104:" // Height 4: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "105:" // Height 4: Column loop
+ "cbz x14, 106f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v20.16b, v8.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "add x14, x14, #0x40\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "b 117f\n"
+ "106:" // Height 4: no bias
+ "tbz %x[flags], #0, 116f\n"
+ "cmp x16, #0x10\n"
+ "bge 115f\n"
+ "tbz x16, #3, 110f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "ld1 { v21.4s }, [x25], #0x10\n"
+ "tbz x16, #2, 108f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "ld1 { v22.4s }, [x25], #0x10\n"
+ "tbz x16, #1, 107f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "tbz x16, #0, 114f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "ld1 { v23.s }[2], [x25]\n"
+ "b 114f\n"
+ "107:" // Height 4: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 114f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "ldr s23, [x25, #0x0]\n"
+ "b 114f\n"
+ "108:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x16, #1, 109f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 114f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "b 114f\n"
+ "109:" // Height 4: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 114f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "b 114f\n"
+ "110:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x16, #2, 112f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "tbz x16, #1, 111f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "tbz x16, #0, 114f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "ld1 { v21.s }[2], [x25]\n"
+ "b 114f\n"
+ "111:" // Height 4: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 114f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "ldr s21, [x25, #0x0]\n"
+ "b 114f\n"
+ "112:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x16, #1, 113f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 114f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "ld1 { v20.s }[2], [x25]\n"
+ "b 114f\n"
+ "113:" // Height 4: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "ldr s20, [x25, #0x0]\n"
+ "114:" // Height 4: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "b 117f\n"
+ "115:" // Height 4: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "b 117f\n"
+ "116:" // Height 4: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "117:" // Height 4: setup done
+ "mov x12, #0x0\n"
+ "118:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 119f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x12, 120f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "b 120f\n"
+ "119:" // Height 4: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "120:" // Height 4: input setup done
+ "cmp x11, #0x4\n"
+ "blt 123f\n"
+ "cmp x11, #0x8\n"
+ "blt 122f\n"
+ "121:" // Height 4: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sub x11, x11, #0x4\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "cmp x11, #0x8\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "fmla v20.4s, v6.4s, v3.s[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "fmla v21.4s, v7.4s, v3.s[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "fmla v22.4s, v6.4s, v3.s[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "fmla v23.4s, v7.4s, v3.s[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "fmla v20.4s, v6.4s, v3.s[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "fmla v21.4s, v7.4s, v3.s[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "fmla v22.4s, v6.4s, v3.s[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "fmla v23.4s, v7.4s, v3.s[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "fmla v20.4s, v6.4s, v3.s[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "fmla v21.4s, v7.4s, v3.s[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v22.4s, v6.4s, v3.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "bge 121b\n"
+ "122:" // Height 4: Multiply loop: Single iteration only
+ "sub x11, x11, #0x4\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "fmla v20.4s, v6.4s, v3.s[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "fmla v21.4s, v7.4s, v3.s[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "fmla v22.4s, v6.4s, v3.s[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "fmla v23.4s, v7.4s, v3.s[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "fmla v20.4s, v6.4s, v3.s[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "fmla v21.4s, v7.4s, v3.s[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "fmla v22.4s, v6.4s, v3.s[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "fmla v23.4s, v7.4s, v3.s[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "fmla v20.4s, v6.4s, v3.s[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "fmla v21.4s, v7.4s, v3.s[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "add x15, x15, #0x100\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v22.4s, v6.4s, v3.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "123:" // Height 4: Multiply loop: Main loop skip
+ "cbz x11, 125f\n"
+ "124:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "sub x11, x11, #0x1\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x15, x15, #0x40\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "cbnz x11, 124b\n"
+ "125:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 118b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbz %x[flags], #1, 126f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "126:" // Height 4: No activation
+ "cmp x16, #0x10\n"
+ "bge 135f\n"
+ "tbz x16, #3, 130f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "tbz x16, #2, 128f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "st1 { v22.4s }, [x25], #0x10\n"
+ "tbz x16, #1, 127f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "tbz x16, #0, 134f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "st1 { v23.s }[2], [x25]\n"
+ "b 134f\n"
+ "127:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 134f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "str s23, [x25, #0x0]\n"
+ "b 134f\n"
+ "128:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 129f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "tbz x16, #0, 134f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "b 134f\n"
+ "129:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 134f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "b 134f\n"
+ "130:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 132f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "tbz x16, #1, 131f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "tbz x16, #0, 134f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "b 134f\n"
+ "131:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 134f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "b 134f\n"
+ "132:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 133f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "tbz x16, #0, 134f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "b 134f\n"
+ "133:" // Height 4: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "134:" // Height 4: Partial direct writeback: Done
+ "b 136f\n"
+ "135:" // Height 4: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "136:" // Height 4: Writeback done
+ "subs x16, x16, #0x10\n"
+ "bgt 105b\n"
+ "b 206f\n"
+ "137:" // Height 5
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 138f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 139f\n"
+ "138:" // Height 5: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "139:" // Height 5: Column loop
+ "cbz x14, 140f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v20.16b, v8.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v24.16b, v8.16b\n"
+ "add x14, x14, #0x40\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "b 151f\n"
+ "140:" // Height 5: no bias
+ "tbz %x[flags], #0, 150f\n"
+ "cmp x16, #0x10\n"
+ "bge 149f\n"
+ "tbz x16, #3, 144f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "ld1 { v21.4s }, [x25], #0x10\n"
+ "ld1 { v25.4s }, [x23], #0x10\n"
+ "tbz x16, #2, 142f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "ld1 { v22.4s }, [x25], #0x10\n"
+ "ld1 { v26.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 141f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "tbz x16, #0, 148f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "ld1 { v23.s }[2], [x25]\n"
+ "ld1 { v27.s }[2], [x23]\n"
+ "b 148f\n"
+ "141:" // Height 5: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 148f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "ldr s23, [x25, #0x0]\n"
+ "ldr s27, [x23, #0x0]\n"
+ "b 148f\n"
+ "142:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x16, #1, 143f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 148f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "ld1 { v26.s }[2], [x23]\n"
+ "b 148f\n"
+ "143:" // Height 5: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 148f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "ldr s26, [x23, #0x0]\n"
+ "b 148f\n"
+ "144:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x16, #2, 146f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 145f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "tbz x16, #0, 148f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "ld1 { v21.s }[2], [x25]\n"
+ "ld1 { v25.s }[2], [x23]\n"
+ "b 148f\n"
+ "145:" // Height 5: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 148f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "ldr s21, [x25, #0x0]\n"
+ "ldr s25, [x23, #0x0]\n"
+ "b 148f\n"
+ "146:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x16, #1, 147f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 148f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "ld1 { v20.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "b 148f\n"
+ "147:" // Height 5: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "ldr s20, [x25, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "148:" // Height 5: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "sub x23, x23, x19\n"
+ "b 151f\n"
+ "149:" // Height 5: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "ldr q24, [x23, #0x0]\n"
+ "ldr q25, [x23, #0x10]\n"
+ "ldr q26, [x23, #0x20]\n"
+ "ldr q27, [x23, #0x30]\n"
+ "b 151f\n"
+ "150:" // Height 5: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "151:" // Height 5: setup done
+ "mov x12, #0x0\n"
+ "152:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 153f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x12, 154f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "b 154f\n"
+ "153:" // Height 5: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "add x22, x24, x19, LSL #2\n"
+ "154:" // Height 5: input setup done
+ "cmp x11, #0x4\n"
+ "blt 157f\n"
+ "cmp x11, #0x8\n"
+ "blt 156f\n"
+ "155:" // Height 5: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sub x11, x11, #0x4\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "cmp x11, #0x8\n"
+ "fmla v25.4s, v7.4s, v4.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "fmla v26.4s, v6.4s, v4.s[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "fmla v27.4s, v7.4s, v4.s[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "fmla v20.4s, v6.4s, v3.s[1]\n"
+ "fmla v24.4s, v6.4s, v4.s[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "fmla v21.4s, v7.4s, v3.s[1]\n"
+ "fmla v25.4s, v7.4s, v4.s[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "fmla v22.4s, v6.4s, v3.s[1]\n"
+ "fmla v26.4s, v6.4s, v4.s[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "fmla v23.4s, v7.4s, v3.s[1]\n"
+ "fmla v27.4s, v7.4s, v4.s[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "fmla v20.4s, v6.4s, v3.s[2]\n"
+ "fmla v24.4s, v6.4s, v4.s[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "fmla v21.4s, v7.4s, v3.s[2]\n"
+ "fmla v25.4s, v7.4s, v4.s[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "fmla v22.4s, v6.4s, v3.s[2]\n"
+ "fmla v26.4s, v6.4s, v4.s[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "fmla v23.4s, v7.4s, v3.s[2]\n"
+ "fmla v27.4s, v7.4s, v4.s[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "fmla v20.4s, v6.4s, v3.s[3]\n"
+ "fmla v24.4s, v6.4s, v4.s[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "fmla v21.4s, v7.4s, v3.s[3]\n"
+ "fmla v25.4s, v7.4s, v4.s[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "add x15, x15, #0x100\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v22.4s, v6.4s, v3.s[3]\n"
+ "fmla v26.4s, v6.4s, v4.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "fmla v27.4s, v7.4s, v4.s[3]\n"
+ "bge 155b\n"
+ "156:" // Height 5: Multiply loop: Single iteration only
+ "sub x11, x11, #0x4\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "fmla v25.4s, v7.4s, v4.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "fmla v26.4s, v6.4s, v4.s[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "fmla v27.4s, v7.4s, v4.s[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "fmla v20.4s, v6.4s, v3.s[1]\n"
+ "fmla v24.4s, v6.4s, v4.s[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "fmla v21.4s, v7.4s, v3.s[1]\n"
+ "fmla v25.4s, v7.4s, v4.s[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "fmla v22.4s, v6.4s, v3.s[1]\n"
+ "fmla v26.4s, v6.4s, v4.s[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "fmla v23.4s, v7.4s, v3.s[1]\n"
+ "fmla v27.4s, v7.4s, v4.s[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "fmla v20.4s, v6.4s, v3.s[2]\n"
+ "fmla v24.4s, v6.4s, v4.s[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "fmla v21.4s, v7.4s, v3.s[2]\n"
+ "fmla v25.4s, v7.4s, v4.s[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "fmla v22.4s, v6.4s, v3.s[2]\n"
+ "fmla v26.4s, v6.4s, v4.s[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "fmla v23.4s, v7.4s, v3.s[2]\n"
+ "fmla v27.4s, v7.4s, v4.s[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "fmla v20.4s, v6.4s, v3.s[3]\n"
+ "fmla v24.4s, v6.4s, v4.s[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "fmla v21.4s, v7.4s, v3.s[3]\n"
+ "fmla v25.4s, v7.4s, v4.s[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "add x15, x15, #0x100\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v22.4s, v6.4s, v3.s[3]\n"
+ "fmla v26.4s, v6.4s, v4.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "fmla v27.4s, v7.4s, v4.s[3]\n"
+ "157:" // Height 5: Multiply loop: Main loop skip
+ "cbz x11, 159f\n"
+ "158:" // Height 5: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "sub x11, x11, #0x1\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "fmla v25.4s, v7.4s, v4.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x15, x15, #0x40\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "fmla v26.4s, v6.4s, v4.s[0]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "fmla v27.4s, v7.4s, v4.s[0]\n"
+ "cbnz x11, 158b\n"
+ "159:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 152b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 160f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v0.4s\n"
+ "fmin v25.4s, v25.4s, v0.4s\n"
+ "fmin v26.4s, v26.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v1.4s\n"
+ "fmax v25.4s, v25.4s, v1.4s\n"
+ "fmax v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v1.4s\n"
+ "160:" // Height 5: No activation
+ "cmp x16, #0x10\n"
+ "bge 169f\n"
+ "tbz x16, #3, 164f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v25.4s }, [x23], #0x10\n"
+ "tbz x16, #2, 162f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "st1 { v22.4s }, [x25], #0x10\n"
+ "st1 { v26.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 161f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "str d27, [x23], #0x8\n"
+ "tbz x16, #0, 168f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "st1 { v23.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x23]\n"
+ "b 168f\n"
+ "161:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 168f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "str s23, [x25, #0x0]\n"
+ "str s27, [x23, #0x0]\n"
+ "b 168f\n"
+ "162:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 163f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "str d26, [x23], #0x8\n"
+ "tbz x16, #0, 168f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "st1 { v26.s }[2], [x23]\n"
+ "b 168f\n"
+ "163:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 168f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "str s26, [x23, #0x0]\n"
+ "b 168f\n"
+ "164:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 166f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "tbz x16, #1, 165f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "str d25, [x23], #0x8\n"
+ "tbz x16, #0, 168f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "st1 { v25.s }[2], [x23]\n"
+ "b 168f\n"
+ "165:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 168f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "str s25, [x23, #0x0]\n"
+ "b 168f\n"
+ "166:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 167f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "tbz x16, #0, 168f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "b 168f\n"
+ "167:" // Height 5: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "str s24, [x23, #0x0]\n"
+ "168:" // Height 5: Partial direct writeback: Done
+ "b 170f\n"
+ "169:" // Height 5: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q25, [x23, #0x10]\n"
+ "str q26, [x23, #0x20]\n"
+ "str q27, [x23, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "add x23, x23, #0x40\n"
+ "170:" // Height 5: Writeback done
+ "subs x16, x16, #0x10\n"
+ "bgt 139b\n"
+ "b 206f\n"
+ "171:" // Height 6
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 172f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "ldr x21, [%x[output_ptr], #0x28]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 173f\n"
+ "172:" // Height 6: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "add x21, x23, x19, LSL #2\n"
+ "add %x[output_ptr], x21, x19, LSL #2\n"
+ "173:" // Height 6: Column loop
+ "cbz x14, 174f\n"
+ "ldr q8, [x14, #0x0]\n"
+ "mov v12.16b, v8.16b\n"
+ "ldr q9, [x14, #0x10]\n"
+ "mov v16.16b, v8.16b\n"
+ "ldr q10, [x14, #0x20]\n"
+ "mov v20.16b, v8.16b\n"
+ "ldr q11, [x14, #0x30]\n"
+ "mov v24.16b, v8.16b\n"
+ "add x14, x14, #0x40\n"
+ "mov v28.16b, v8.16b\n"
+ "mov v13.16b, v9.16b\n"
+ "mov v17.16b, v9.16b\n"
+ "mov v14.16b, v10.16b\n"
+ "mov v15.16b, v11.16b\n"
+ "mov v18.16b, v10.16b\n"
+ "mov v19.16b, v11.16b\n"
+ "mov v21.16b, v9.16b\n"
+ "mov v22.16b, v10.16b\n"
+ "mov v23.16b, v11.16b\n"
+ "mov v25.16b, v9.16b\n"
+ "mov v26.16b, v10.16b\n"
+ "mov v27.16b, v11.16b\n"
+ "mov v29.16b, v9.16b\n"
+ "mov v30.16b, v10.16b\n"
+ "mov v31.16b, v11.16b\n"
+ "b 185f\n"
+ "174:" // Height 6: no bias
+ "tbz %x[flags], #0, 184f\n"
+ "cmp x16, #0x10\n"
+ "bge 183f\n"
+ "tbz x16, #3, 178f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "ld1 { v21.4s }, [x25], #0x10\n"
+ "ld1 { v25.4s }, [x23], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
+ "tbz x16, #2, 176f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "ld1 { v22.4s }, [x25], #0x10\n"
+ "ld1 { v26.4s }, [x23], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 175f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "tbz x16, #0, 182f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "ld1 { v23.s }[2], [x25]\n"
+ "ld1 { v27.s }[2], [x23]\n"
+ "ld1 { v31.s }[2], [x21]\n"
+ "b 182f\n"
+ "175:" // Height 6: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x16, #0, 182f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "ldr s23, [x25, #0x0]\n"
+ "ldr s27, [x23, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
+ "b 182f\n"
+ "176:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x16, #1, 177f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x16, #0, 182f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "ld1 { v26.s }[2], [x23]\n"
+ "ld1 { v30.s }[2], [x21]\n"
+ "b 182f\n"
+ "177:" // Height 6: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x16, #0, 182f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "ldr s26, [x23, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
+ "b 182f\n"
+ "178:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x16, #2, 180f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 179f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
+ "tbz x16, #0, 182f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "ld1 { v21.s }[2], [x25]\n"
+ "ld1 { v25.s }[2], [x23]\n"
+ "ld1 { v29.s }[2], [x21]\n"
+ "b 182f\n"
+ "179:" // Height 6: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x16, #0, 182f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "ldr s21, [x25, #0x0]\n"
+ "ldr s25, [x23, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
+ "b 182f\n"
+ "180:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x16, #1, 181f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x16, #0, 182f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "ld1 { v20.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v28.s }[2], [x21]\n"
+ "b 182f\n"
+ "181:" // Height 6: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "ldr s20, [x25, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
+ "182:" // Height 6: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "sub x23, x23, x19\n"
+ "sub x21, x21, x19\n"
+ "b 185f\n"
+ "183:" // Height 6: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "ldr q24, [x23, #0x0]\n"
+ "ldr q25, [x23, #0x10]\n"
+ "ldr q26, [x23, #0x20]\n"
+ "ldr q27, [x23, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
+ "b 185f\n"
+ "184:" // Height 6: no accumulate
+ "movi v8.16b, #0x0\n"
+ "movi v9.16b, #0x0\n"
+ "movi v10.16b, #0x0\n"
+ "movi v11.16b, #0x0\n"
+ "movi v12.16b, #0x0\n"
+ "movi v13.16b, #0x0\n"
+ "movi v14.16b, #0x0\n"
+ "movi v15.16b, #0x0\n"
+ "movi v16.16b, #0x0\n"
+ "movi v17.16b, #0x0\n"
+ "movi v18.16b, #0x0\n"
+ "movi v19.16b, #0x0\n"
+ "movi v20.16b, #0x0\n"
+ "movi v21.16b, #0x0\n"
+ "movi v22.16b, #0x0\n"
+ "movi v23.16b, #0x0\n"
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "185:" // Height 6: setup done
+ "mov x12, #0x0\n"
+ "186:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 187f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x12, 188f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "add x20, x20, x19, LSL #2\n"
+ "b 188f\n"
+ "187:" // Height 6: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "add x22, x24, x19, LSL #2\n"
+ "add x20, x22, x19, LSL #2\n"
+ "188:" // Height 6: input setup done
+ "cmp x11, #0x4\n"
+ "blt 191f\n"
+ "cmp x11, #0x8\n"
+ "blt 190f\n"
+ "189:" // Height 6: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v28.4s, v6.4s, v5.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "sub x11, x11, #0x4\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "cmp x11, #0x8\n"
+ "fmla v25.4s, v7.4s, v4.s[0]\n"
+ "fmla v29.4s, v7.4s, v5.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "fmla v26.4s, v6.4s, v4.s[0]\n"
+ "fmla v30.4s, v6.4s, v5.s[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "fmla v27.4s, v7.4s, v4.s[0]\n"
+ "fmla v31.4s, v7.4s, v5.s[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "fmla v20.4s, v6.4s, v3.s[1]\n"
+ "fmla v24.4s, v6.4s, v4.s[1]\n"
+ "fmla v28.4s, v6.4s, v5.s[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "fmla v21.4s, v7.4s, v3.s[1]\n"
+ "fmla v25.4s, v7.4s, v4.s[1]\n"
+ "fmla v29.4s, v7.4s, v5.s[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "fmla v22.4s, v6.4s, v3.s[1]\n"
+ "fmla v26.4s, v6.4s, v4.s[1]\n"
+ "fmla v30.4s, v6.4s, v5.s[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "fmla v23.4s, v7.4s, v3.s[1]\n"
+ "fmla v27.4s, v7.4s, v4.s[1]\n"
+ "fmla v31.4s, v7.4s, v5.s[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "fmla v20.4s, v6.4s, v3.s[2]\n"
+ "fmla v24.4s, v6.4s, v4.s[2]\n"
+ "fmla v28.4s, v6.4s, v5.s[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "fmla v21.4s, v7.4s, v3.s[2]\n"
+ "fmla v25.4s, v7.4s, v4.s[2]\n"
+ "fmla v29.4s, v7.4s, v5.s[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "fmla v22.4s, v6.4s, v3.s[2]\n"
+ "fmla v26.4s, v6.4s, v4.s[2]\n"
+ "fmla v30.4s, v6.4s, v5.s[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "fmla v23.4s, v7.4s, v3.s[2]\n"
+ "fmla v27.4s, v7.4s, v4.s[2]\n"
+ "fmla v31.4s, v7.4s, v5.s[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "fmla v20.4s, v6.4s, v3.s[3]\n"
+ "fmla v24.4s, v6.4s, v4.s[3]\n"
+ "fmla v28.4s, v6.4s, v5.s[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "fmla v21.4s, v7.4s, v3.s[3]\n"
+ "fmla v25.4s, v7.4s, v4.s[3]\n"
+ "fmla v29.4s, v7.4s, v5.s[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "add x15, x15, #0x100\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v22.4s, v6.4s, v3.s[3]\n"
+ "fmla v26.4s, v6.4s, v4.s[3]\n"
+ "fmla v30.4s, v6.4s, v5.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "fmla v27.4s, v7.4s, v4.s[3]\n"
+ "fmla v31.4s, v7.4s, v5.s[3]\n"
+ "bge 189b\n"
+ "190:" // Height 6: Multiply loop: Single iteration only
+ "sub x11, x11, #0x4\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v28.4s, v6.4s, v5.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "fmla v25.4s, v7.4s, v4.s[0]\n"
+ "fmla v29.4s, v7.4s, v5.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "fmla v26.4s, v6.4s, v4.s[0]\n"
+ "fmla v30.4s, v6.4s, v5.s[0]\n"
+ "ldr q6, [x15, #0x40]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "fmla v27.4s, v7.4s, v4.s[0]\n"
+ "fmla v31.4s, v7.4s, v5.s[0]\n"
+ "ldr q7, [x15, #0x50]\n"
+ "fmla v8.4s, v6.4s, v0.s[1]\n"
+ "fmla v12.4s, v6.4s, v1.s[1]\n"
+ "fmla v16.4s, v6.4s, v2.s[1]\n"
+ "fmla v20.4s, v6.4s, v3.s[1]\n"
+ "fmla v24.4s, v6.4s, v4.s[1]\n"
+ "fmla v28.4s, v6.4s, v5.s[1]\n"
+ "ldr q6, [x15, #0x60]\n"
+ "fmla v9.4s, v7.4s, v0.s[1]\n"
+ "fmla v13.4s, v7.4s, v1.s[1]\n"
+ "fmla v17.4s, v7.4s, v2.s[1]\n"
+ "fmla v21.4s, v7.4s, v3.s[1]\n"
+ "fmla v25.4s, v7.4s, v4.s[1]\n"
+ "fmla v29.4s, v7.4s, v5.s[1]\n"
+ "ldr q7, [x15, #0x70]\n"
+ "fmla v10.4s, v6.4s, v0.s[1]\n"
+ "fmla v14.4s, v6.4s, v1.s[1]\n"
+ "fmla v18.4s, v6.4s, v2.s[1]\n"
+ "fmla v22.4s, v6.4s, v3.s[1]\n"
+ "fmla v26.4s, v6.4s, v4.s[1]\n"
+ "fmla v30.4s, v6.4s, v5.s[1]\n"
+ "ldr q6, [x15, #0x80]\n"
+ "fmla v11.4s, v7.4s, v0.s[1]\n"
+ "fmla v15.4s, v7.4s, v1.s[1]\n"
+ "fmla v19.4s, v7.4s, v2.s[1]\n"
+ "fmla v23.4s, v7.4s, v3.s[1]\n"
+ "fmla v27.4s, v7.4s, v4.s[1]\n"
+ "fmla v31.4s, v7.4s, v5.s[1]\n"
+ "ldr q7, [x15, #0x90]\n"
+ "fmla v8.4s, v6.4s, v0.s[2]\n"
+ "fmla v12.4s, v6.4s, v1.s[2]\n"
+ "fmla v16.4s, v6.4s, v2.s[2]\n"
+ "fmla v20.4s, v6.4s, v3.s[2]\n"
+ "fmla v24.4s, v6.4s, v4.s[2]\n"
+ "fmla v28.4s, v6.4s, v5.s[2]\n"
+ "ldr q6, [x15, #0xa0]\n"
+ "fmla v9.4s, v7.4s, v0.s[2]\n"
+ "fmla v13.4s, v7.4s, v1.s[2]\n"
+ "fmla v17.4s, v7.4s, v2.s[2]\n"
+ "fmla v21.4s, v7.4s, v3.s[2]\n"
+ "fmla v25.4s, v7.4s, v4.s[2]\n"
+ "fmla v29.4s, v7.4s, v5.s[2]\n"
+ "ldr q7, [x15, #0xb0]\n"
+ "fmla v10.4s, v6.4s, v0.s[2]\n"
+ "fmla v14.4s, v6.4s, v1.s[2]\n"
+ "fmla v18.4s, v6.4s, v2.s[2]\n"
+ "fmla v22.4s, v6.4s, v3.s[2]\n"
+ "fmla v26.4s, v6.4s, v4.s[2]\n"
+ "fmla v30.4s, v6.4s, v5.s[2]\n"
+ "ldr q6, [x15, #0xc0]\n"
+ "fmla v11.4s, v7.4s, v0.s[2]\n"
+ "fmla v15.4s, v7.4s, v1.s[2]\n"
+ "fmla v19.4s, v7.4s, v2.s[2]\n"
+ "fmla v23.4s, v7.4s, v3.s[2]\n"
+ "fmla v27.4s, v7.4s, v4.s[2]\n"
+ "fmla v31.4s, v7.4s, v5.s[2]\n"
+ "ldr q7, [x15, #0xd0]\n"
+ "fmla v8.4s, v6.4s, v0.s[3]\n"
+ "fmla v12.4s, v6.4s, v1.s[3]\n"
+ "fmla v16.4s, v6.4s, v2.s[3]\n"
+ "fmla v20.4s, v6.4s, v3.s[3]\n"
+ "fmla v24.4s, v6.4s, v4.s[3]\n"
+ "fmla v28.4s, v6.4s, v5.s[3]\n"
+ "ldr q6, [x15, #0xe0]\n"
+ "fmla v9.4s, v7.4s, v0.s[3]\n"
+ "fmla v13.4s, v7.4s, v1.s[3]\n"
+ "fmla v17.4s, v7.4s, v2.s[3]\n"
+ "fmla v21.4s, v7.4s, v3.s[3]\n"
+ "fmla v25.4s, v7.4s, v4.s[3]\n"
+ "fmla v29.4s, v7.4s, v5.s[3]\n"
+ "ldr q7, [x15, #0xf0]\n"
+ "fmla v10.4s, v6.4s, v0.s[3]\n"
+ "add x15, x15, #0x100\n"
+ "fmla v14.4s, v6.4s, v1.s[3]\n"
+ "fmla v18.4s, v6.4s, v2.s[3]\n"
+ "fmla v22.4s, v6.4s, v3.s[3]\n"
+ "fmla v26.4s, v6.4s, v4.s[3]\n"
+ "fmla v30.4s, v6.4s, v5.s[3]\n"
+ "fmla v11.4s, v7.4s, v0.s[3]\n"
+ "fmla v15.4s, v7.4s, v1.s[3]\n"
+ "fmla v19.4s, v7.4s, v2.s[3]\n"
+ "fmla v23.4s, v7.4s, v3.s[3]\n"
+ "fmla v27.4s, v7.4s, v4.s[3]\n"
+ "fmla v31.4s, v7.4s, v5.s[3]\n"
+ "191:" // Height 6: Multiply loop: Main loop skip
+ "cbz x11, 193f\n"
+ "192:" // Height 6: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s5, [x20], #0x4\n"
+ "ldr q6, [x15, #0x0]\n"
+ "fmla v8.4s, v6.4s, v0.s[0]\n"
+ "ldr q7, [x15, #0x10]\n"
+ "fmla v12.4s, v6.4s, v1.s[0]\n"
+ "sub x11, x11, #0x1\n"
+ "fmla v16.4s, v6.4s, v2.s[0]\n"
+ "fmla v20.4s, v6.4s, v3.s[0]\n"
+ "fmla v24.4s, v6.4s, v4.s[0]\n"
+ "fmla v28.4s, v6.4s, v5.s[0]\n"
+ "ldr q6, [x15, #0x20]\n"
+ "fmla v9.4s, v7.4s, v0.s[0]\n"
+ "fmla v13.4s, v7.4s, v1.s[0]\n"
+ "fmla v17.4s, v7.4s, v2.s[0]\n"
+ "fmla v21.4s, v7.4s, v3.s[0]\n"
+ "fmla v25.4s, v7.4s, v4.s[0]\n"
+ "fmla v29.4s, v7.4s, v5.s[0]\n"
+ "ldr q7, [x15, #0x30]\n"
+ "fmla v10.4s, v6.4s, v0.s[0]\n"
+ "add x15, x15, #0x40\n"
+ "fmla v14.4s, v6.4s, v1.s[0]\n"
+ "fmla v18.4s, v6.4s, v2.s[0]\n"
+ "fmla v22.4s, v6.4s, v3.s[0]\n"
+ "fmla v26.4s, v6.4s, v4.s[0]\n"
+ "fmla v30.4s, v6.4s, v5.s[0]\n"
+ "fmla v11.4s, v7.4s, v0.s[0]\n"
+ "fmla v15.4s, v7.4s, v1.s[0]\n"
+ "fmla v19.4s, v7.4s, v2.s[0]\n"
+ "fmla v23.4s, v7.4s, v3.s[0]\n"
+ "fmla v27.4s, v7.4s, v4.s[0]\n"
+ "fmla v31.4s, v7.4s, v5.s[0]\n"
+ "cbnz x11, 192b\n"
+ "193:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 186b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 194f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "fmin v8.4s, v8.4s, v0.4s\n"
+ "fmin v9.4s, v9.4s, v0.4s\n"
+ "fmin v10.4s, v10.4s, v0.4s\n"
+ "fmin v11.4s, v11.4s, v0.4s\n"
+ "fmax v8.4s, v8.4s, v1.4s\n"
+ "fmax v9.4s, v9.4s, v1.4s\n"
+ "fmax v10.4s, v10.4s, v1.4s\n"
+ "fmax v11.4s, v11.4s, v1.4s\n"
+ "fmin v12.4s, v12.4s, v0.4s\n"
+ "fmin v13.4s, v13.4s, v0.4s\n"
+ "fmin v14.4s, v14.4s, v0.4s\n"
+ "fmax v12.4s, v12.4s, v1.4s\n"
+ "fmax v13.4s, v13.4s, v1.4s\n"
+ "fmax v14.4s, v14.4s, v1.4s\n"
+ "fmin v15.4s, v15.4s, v0.4s\n"
+ "fmin v16.4s, v16.4s, v0.4s\n"
+ "fmin v17.4s, v17.4s, v0.4s\n"
+ "fmax v15.4s, v15.4s, v1.4s\n"
+ "fmax v16.4s, v16.4s, v1.4s\n"
+ "fmax v17.4s, v17.4s, v1.4s\n"
+ "fmin v18.4s, v18.4s, v0.4s\n"
+ "fmin v19.4s, v19.4s, v0.4s\n"
+ "fmin v20.4s, v20.4s, v0.4s\n"
+ "fmax v18.4s, v18.4s, v1.4s\n"
+ "fmax v19.4s, v19.4s, v1.4s\n"
+ "fmax v20.4s, v20.4s, v1.4s\n"
+ "fmin v21.4s, v21.4s, v0.4s\n"
+ "fmin v22.4s, v22.4s, v0.4s\n"
+ "fmin v23.4s, v23.4s, v0.4s\n"
+ "fmax v21.4s, v21.4s, v1.4s\n"
+ "fmax v22.4s, v22.4s, v1.4s\n"
+ "fmax v23.4s, v23.4s, v1.4s\n"
+ "fmin v24.4s, v24.4s, v0.4s\n"
+ "fmin v25.4s, v25.4s, v0.4s\n"
+ "fmin v26.4s, v26.4s, v0.4s\n"
+ "fmax v24.4s, v24.4s, v1.4s\n"
+ "fmax v25.4s, v25.4s, v1.4s\n"
+ "fmax v26.4s, v26.4s, v1.4s\n"
+ "fmin v27.4s, v27.4s, v0.4s\n"
+ "fmin v28.4s, v28.4s, v0.4s\n"
+ "fmin v29.4s, v29.4s, v0.4s\n"
+ "fmax v27.4s, v27.4s, v1.4s\n"
+ "fmax v28.4s, v28.4s, v1.4s\n"
+ "fmax v29.4s, v29.4s, v1.4s\n"
+ "fmin v30.4s, v30.4s, v0.4s\n"
+ "fmin v31.4s, v31.4s, v0.4s\n"
+ "fmax v30.4s, v30.4s, v1.4s\n"
+ "fmax v31.4s, v31.4s, v1.4s\n"
+ "194:" // Height 6: No activation
+ "cmp x16, #0x10\n"
+ "bge 203f\n"
+ "tbz x16, #3, 198f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v25.4s }, [x23], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
+ "tbz x16, #2, 196f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "st1 { v22.4s }, [x25], #0x10\n"
+ "st1 { v26.4s }, [x23], #0x10\n"
+ "st1 { v30.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 195f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "str d27, [x23], #0x8\n"
+ "str d31, [x21], #0x8\n"
+ "tbz x16, #0, 202f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "st1 { v23.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x23]\n"
+ "st1 { v31.s }[2], [x21]\n"
+ "b 202f\n"
+ "195:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x16, #0, 202f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "str s23, [x25, #0x0]\n"
+ "str s27, [x23, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
+ "b 202f\n"
+ "196:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x16, #1, 197f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "str d26, [x23], #0x8\n"
+ "str d30, [x21], #0x8\n"
+ "tbz x16, #0, 202f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "st1 { v26.s }[2], [x23]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "b 202f\n"
+ "197:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x16, #0, 202f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "str s26, [x23, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "b 202f\n"
+ "198:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x16, #2, 200f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "tbz x16, #1, 199f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "str d25, [x23], #0x8\n"
+ "str d29, [x21], #0x8\n"
+ "tbz x16, #0, 202f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "st1 { v25.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "b 202f\n"
+ "199:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x16, #0, 202f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "str s25, [x23, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
+ "b 202f\n"
+ "200:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x16, #1, 201f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x16, #0, 202f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "b 202f\n"
+ "201:" // Height 6: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "str s24, [x23, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
+ "202:" // Height 6: Partial direct writeback: Done
+ "b 204f\n"
+ "203:" // Height 6: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q25, [x23, #0x10]\n"
+ "str q26, [x23, #0x20]\n"
+ "str q27, [x23, #0x30]\n"
+ "str q28, [x21, #0x0]\n"
+ "str q29, [x21, #0x10]\n"
+ "str q30, [x21, #0x20]\n"
+ "str q31, [x21, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "add x23, x23, #0x40\n"
+ "add x21, x21, #0x40\n"
+ "204:" // Height 6: Writeback done
+ "subs x16, x16, #0x10\n"
+ "bgt 173b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 206f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 205f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "205:" // Update direct input
+ "mov x19, #0x18\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "206:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp
new file mode 100644
index 0000000000..043d0643f0
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+
+#define ARGLIST /* kernel parameter types; names below follow the generic.cpp definition */ \
+ unsigned int, const unsigned int *, /* num_strings, string_lengths */ \
+ IndirectInputArg<float>, /* A_arg */ \
+ size_t, size_t, /* M, N */ \
+ const float *, /* B_ptr */ \
+ IndirectOutputArg<float>, /* output_arg */ \
+ const float *, Activation, bool /* bias, act, accumulate */
+
+namespace arm_gemm
+{
+
+// Kernel entry point; its definition is in a64_hybrid_fp32_mla_8x4/generic.cpp.
+void a64_hybrid_fp32_mla_8x4( ARGLIST );
+
+class cls_a64_hybrid_fp32_mla_8x4
+{
+    // Describes the a64_hybrid_fp32_mla_8x4 kernel: element types, kernel
+    // function-pointer type, blocking parameters, transforms and the kernel
+    // pointer itself.
+public:
+    using operand_type = float;
+    using result_type  = float;
+
+    // Pointer to a function taking the ARGLIST parameter types.
+    using kern_type = void (*)( ARGLIST );
+
+    /* Kernel blocking parameters */
+    // Output-height blocking value.
+    static constexpr unsigned int out_height() { return 8; }
+
+    // Output-width blocking value; constexpr like its siblings so that it is
+    // usable in constant expressions as well as at run time.
+    static constexpr unsigned int out_width() { return 4; }
+
+    // K unroll value.
+    static constexpr unsigned int k_unroll() { return 1; }
+
+    // Always true for this kernel.
+    static constexpr bool supports_accumulate() { return true; }
+
+    // The literals 8, 4, 1 equal out_height(), out_width() and k_unroll().
+    // NOTE(review): presumably they must stay in step with those - confirm.
+    StdTransformsFixed<operand_type, result_type, 8, 4, 1> transforms = {};
+
+    // Default to the generic kernel
+    kern_type kernel = a64_hybrid_fp32_mla_8x4;
+
+    // The CPUInfo pointer is ignored; kernel keeps the generic default above.
+    cls_a64_hybrid_fp32_mla_8x4(const CPUInfo *)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp
new file mode 100644
index 0000000000..3ab6cad368
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_fp32_mla_8x4/generic.cpp
@@ -0,0 +1,2195 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_fp32_mla_8x4 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const float *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x8\n"
+ "bge 155f\n"
+ "cmp %x[M], #0x6\n"
+ "bgt 133f\n"
+ "beq 111f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 89f\n"
+ "beq 67f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 45f\n"
+ "beq 23f\n"
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "cbz x8, 4f\n"
+ "ldr q24, [x8, #0x0]\n"
+ "add x8, x8, #0x10\n"
+ "b 9f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 8f\n"
+ "cmp x6, #0x4\n"
+ "bge 7f\n"
+ "tbz x6, #1, 5f\n"
+ "ldr d24, [x17], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x6, #0, 6f\n"
+ "ld1 { v24.s }[2], [x17]\n"
+ "b 6f\n"
+ "5:" // Height 1: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s24, [x17, #0x0]\n"
+ "6:" // Height 1: Partial accumulate: Done
+ "sub x17, x17, x19\n"
+ "b 9f\n"
+ "7:" // Height 1: full accumulate
+ "ldr q24, [x17, #0x0]\n"
+ "b 9f\n"
+ "8:" // Height 1: no accumulate
+ "movi v24.16b, #0x0\n"
+ "9:" // Height 1: setup done
+ "mov x16, #0x0\n"
+ "10:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 11f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "cbnz x16, 12f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "b 12f\n"
+ "11:" // Height 1: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "12:" // Height 1: input setup done
+ "cmp x15, #0x4\n"
+ "blt 15f\n"
+ "cmp x15, #0x8\n"
+ "blt 14f\n"
+ "13:" // Height 1: Multiply loop: Main loop head
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q8, [x7, #0x0]\n"
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x7, #0x10]\n"
+ "ldr q10, [x7, #0x20]\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "ldr q11, [x7, #0x30]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "sub x15, x15, #0x4\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "cmp x15, #0x8\n"
+ "add x7, x7, #0x40\n"
+ "bge 13b\n"
+ "14:" // Height 1: Multiply loop: Single iteration only
+ "sub x15, x15, #0x4\n"
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q12, [x7, #0x0]\n"
+ "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "ldr q13, [x7, #0x10]\n"
+ "ldr q14, [x7, #0x20]\n"
+ "fmla v24.4s, v13.4s, v0.s[1]\n"
+ "ldr q15, [x7, #0x30]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v24.4s, v14.4s, v0.s[2]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "add x7, x7, #0x40\n"
+ "fmla v24.4s, v15.4s, v0.s[3]\n"
+ "15:" // Height 1: Multiply loop: Main loop skip
+ "cbz x15, 17f\n"
+ "16:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x14], #0x4\n"
+ "ldr q16, [x7, #0x0]\n"
+ "fmla v24.4s, v16.4s, v0.s[0]\n"
+ "sub x15, x15, #0x1\n"
+ "add x7, x7, #0x10\n"
+ "cbnz x15, 16b\n"
+ "17:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x16, x16, #0x1\n"
+ "cmp x16, x19\n"
+ "bne 10b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "tbz %x[flags], #1, 18f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "18:" // Height 1: No activation
+ "cmp x6, #0x4\n"
+ "bge 21f\n"
+ "tbz x6, #1, 19f\n"
+ "str d24, [x17], #0x8\n"
+ "tbz x6, #0, 20f\n"
+ "st1 { v24.s }[2], [x17]\n"
+ "b 20f\n"
+ "19:" // Height 1: Partial direct writeback: partial_1_0
+ "str s24, [x17, #0x0]\n"
+ "20:" // Height 1: Partial direct writeback: Done
+ "b 22f\n"
+ "21:" // Height 1: Full writeback
+ "str q24, [x17, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "22:" // Height 1: Writeback done
+ "subs x6, x6, #0x4\n"
+ "bgt 3b\n"
+ "b 178f\n"
+ "23:" // Height 2
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 24f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "b 25f\n"
+ "24:" // Height 2: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "25:" // Height 2: Column loop
+ "cbz x8, 26f\n"
+ "ldr q24, [x8, #0x0]\n"
+ "mov v25.16b, v24.16b\n"
+ "add x8, x8, #0x10\n"
+ "b 31f\n"
+ "26:" // Height 2: no bias
+ "tbz %x[flags], #0, 30f\n"
+ "cmp x6, #0x4\n"
+ "bge 29f\n"
+ "tbz x6, #1, 27f\n"
+ "ldr d24, [x17], #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x6, #0, 28f\n"
+ "ld1 { v24.s }[2], [x17]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "b 28f\n"
+ "27:" // Height 2: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s24, [x17, #0x0]\n"
+ "ldr s25, [x13, #0x0]\n"
+ "28:" // Height 2: Partial accumulate: Done
+ "sub x17, x17, x19\n"
+ "sub x13, x13, x19\n"
+ "b 31f\n"
+ "29:" // Height 2: full accumulate
+ "ldr q24, [x17, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "b 31f\n"
+ "30:" // Height 2: no accumulate
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "31:" // Height 2: setup done
+ "mov x16, #0x0\n"
+ "32:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 33f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "cbnz x16, 34f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "b 34f\n"
+ "33:" // Height 2: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "34:" // Height 2: input setup done
+ "cmp x15, #0x4\n"
+ "blt 37f\n"
+ "cmp x15, #0x8\n"
+ "blt 36f\n"
+ "35:" // Height 2: Multiply loop: Main loop head
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q8, [x7, #0x0]\n"
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x7, #0x10]\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x7, #0x20]\n"
+ "ldr q11, [x7, #0x30]\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "add x14, x14, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "sub x15, x15, #0x4\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "cmp x15, #0x8\n"
+ "add x7, x7, #0x40\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "bge 35b\n"
+ "36:" // Height 2: Multiply loop: Single iteration only
+ "sub x15, x15, #0x4\n"
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q12, [x7, #0x0]\n"
+ "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "ldr q13, [x7, #0x10]\n"
+ "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "ldr q14, [x7, #0x20]\n"
+ "ldr q15, [x7, #0x30]\n"
+ "fmla v24.4s, v13.4s, v0.s[1]\n"
+ "add x14, x14, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "fmla v25.4s, v13.4s, v1.s[1]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v24.4s, v14.4s, v0.s[2]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "add x7, x7, #0x40\n"
+ "fmla v25.4s, v14.4s, v1.s[2]\n"
+ "fmla v24.4s, v15.4s, v0.s[3]\n"
+ "fmla v25.4s, v15.4s, v1.s[3]\n"
+ "37:" // Height 2: Multiply loop: Main loop skip
+ "cbz x15, 39f\n"
+ "38:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x14], #0x4\n"
+ "ldr s1, [x12], #0x4\n"
+ "ldr q16, [x7, #0x0]\n"
+ "fmla v24.4s, v16.4s, v0.s[0]\n"
+ "sub x15, x15, #0x1\n"
+ "fmla v25.4s, v16.4s, v1.s[0]\n"
+ "add x7, x7, #0x10\n"
+ "cbnz x15, 38b\n"
+ "39:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x16, x16, #0x1\n"
+ "cmp x16, x19\n"
+ "bne 32b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "tbz %x[flags], #1, 40f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "40:" // Height 2: No activation
+ "cmp x6, #0x4\n"
+ "bge 43f\n"
+ "tbz x6, #1, 41f\n"
+ "str d24, [x17], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "tbz x6, #0, 42f\n"
+ "st1 { v24.s }[2], [x17]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "b 42f\n"
+ "41:" // Height 2: Partial direct writeback: partial_1_0
+ "str s24, [x17, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "42:" // Height 2: Partial direct writeback: Done
+ "b 44f\n"
+ "43:" // Height 2: Full writeback
+ "str q24, [x17, #0x0]\n"
+ "str q25, [x13, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "add x13, x13, #0x10\n"
+ "44:" // Height 2: Writeback done
+ "subs x6, x6, #0x4\n"
+ "bgt 25b\n"
+ "b 178f\n"
+ "45:" // Height 3
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 46f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "ldr x11, [%x[output_ptr], #0x10]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "add x11, x11, x19, LSL #2\n"
+ "b 47f\n"
+ "46:" // Height 3: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "add x11, x13, x19, LSL #2\n"
+ "47:" // Height 3: Column loop
+ "cbz x8, 48f\n"
+ "ldr q24, [x8, #0x0]\n"
+ "mov v25.16b, v24.16b\n"
+ "add x8, x8, #0x10\n"
+ "mov v26.16b, v24.16b\n"
+ "b 53f\n"
+ "48:" // Height 3: no bias
+ "tbz %x[flags], #0, 52f\n"
+ "cmp x6, #0x4\n"
+ "bge 51f\n"
+ "tbz x6, #1, 49f\n"
+ "ldr d24, [x17], #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x11], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x6, #0, 50f\n"
+ "ld1 { v24.s }[2], [x17]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x11]\n"
+ "b 50f\n"
+ "49:" // Height 3: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s24, [x17, #0x0]\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x11, #0x0]\n"
+ "50:" // Height 3: Partial accumulate: Done
+ "sub x17, x17, x19\n"
+ "sub x13, x13, x19\n"
+ "sub x11, x11, x19\n"
+ "b 53f\n"
+ "51:" // Height 3: full accumulate
+ "ldr q24, [x17, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x11, #0x0]\n"
+ "b 53f\n"
+ "52:" // Height 3: no accumulate
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "53:" // Height 3: setup done
+ "mov x16, #0x0\n"
+ "54:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 55f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "cbnz x16, 56f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x10, x10, x19, LSL #2\n"
+ "b 56f\n"
+ "55:" // Height 3: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "add x10, x12, x19, LSL #2\n"
+ "56:" // Height 3: input setup done
+ "cmp x15, #0x4\n"
+ "blt 59f\n"
+ "cmp x15, #0x8\n"
+ "blt 58f\n"
+ "57:" // Height 3: Multiply loop: Main loop head
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q8, [x7, #0x0]\n"
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x7, #0x10]\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x7, #0x20]\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr q11, [x7, #0x30]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "sub x15, x15, #0x4\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "cmp x15, #0x8\n"
+ "add x7, x7, #0x40\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "bge 57b\n"
+ "58:" // Height 3: Multiply loop: Single iteration only
+ "sub x15, x15, #0x4\n"
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q12, [x7, #0x0]\n"
+ "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "ldr q13, [x7, #0x10]\n"
+ "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "ldr q14, [x7, #0x20]\n"
+ "fmla v26.4s, v12.4s, v2.s[0]\n"
+ "ldr q15, [x7, #0x30]\n"
+ "add x14, x14, #0x10\n"
+ "fmla v24.4s, v13.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v25.4s, v13.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v26.4s, v13.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x7, x7, #0x40\n"
+ "fmla v24.4s, v14.4s, v0.s[2]\n"
+ "fmla v25.4s, v14.4s, v1.s[2]\n"
+ "fmla v26.4s, v14.4s, v2.s[2]\n"
+ "fmla v24.4s, v15.4s, v0.s[3]\n"
+ "fmla v25.4s, v15.4s, v1.s[3]\n"
+ "fmla v26.4s, v15.4s, v2.s[3]\n"
+ "59:" // Height 3: Multiply loop: Main loop skip
+ "cbz x15, 61f\n"
+ "60:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x14], #0x4\n"
+ "ldr s1, [x12], #0x4\n"
+ "ldr s2, [x10], #0x4\n"
+ "ldr q16, [x7, #0x0]\n"
+ "fmla v24.4s, v16.4s, v0.s[0]\n"
+ "sub x15, x15, #0x1\n"
+ "fmla v25.4s, v16.4s, v1.s[0]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v26.4s, v16.4s, v2.s[0]\n"
+ "cbnz x15, 60b\n"
+ "61:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x16, x16, #0x1\n"
+ "cmp x16, x19\n"
+ "bne 54b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "tbz %x[flags], #1, 62f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "62:" // Height 3: No activation
+ "cmp x6, #0x4\n"
+ "bge 65f\n"
+ "tbz x6, #1, 63f\n"
+ "str d24, [x17], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x11], #0x8\n"
+ "tbz x6, #0, 64f\n"
+ "st1 { v24.s }[2], [x17]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x11]\n"
+ "b 64f\n"
+ "63:" // Height 3: Partial direct writeback: partial_1_0
+ "str s24, [x17, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x11, #0x0]\n"
+ "64:" // Height 3: Partial direct writeback: Done
+ "b 66f\n"
+ "65:" // Height 3: Full writeback
+ "str q24, [x17, #0x0]\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x11, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "add x13, x13, #0x10\n"
+ "add x11, x11, #0x10\n"
+ "66:" // Height 3: Writeback done
+ "subs x6, x6, #0x4\n"
+ "bgt 47b\n"
+ "b 178f\n"
+ "67:" // Height 4
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 68f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "ldr x11, [%x[output_ptr], #0x10]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x18]\n"
+ "add x11, x11, x19, LSL #2\n"
+ "add x9, x9, x19, LSL #2\n"
+ "b 69f\n"
+ "68:" // Height 4: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "add x11, x13, x19, LSL #2\n"
+ "add x9, x11, x19, LSL #2\n"
+ "69:" // Height 4: Column loop
+ "cbz x8, 70f\n"
+ "ldr q24, [x8, #0x0]\n"
+ "mov v25.16b, v24.16b\n"
+ "add x8, x8, #0x10\n"
+ "mov v26.16b, v24.16b\n"
+ "mov v27.16b, v24.16b\n"
+ "b 75f\n"
+ "70:" // Height 4: no bias
+ "tbz %x[flags], #0, 74f\n"
+ "cmp x6, #0x4\n"
+ "bge 73f\n"
+ "tbz x6, #1, 71f\n"
+ "ldr d24, [x17], #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x11], #0x8\n"
+ "ldr d27, [x9], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x6, #0, 72f\n"
+ "ld1 { v24.s }[2], [x17]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x11]\n"
+ "ld1 { v27.s }[2], [x9]\n"
+ "b 72f\n"
+ "71:" // Height 4: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s24, [x17, #0x0]\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x11, #0x0]\n"
+ "ldr s27, [x9, #0x0]\n"
+ "72:" // Height 4: Partial accumulate: Done
+ "sub x17, x17, x19\n"
+ "sub x13, x13, x19\n"
+ "sub x11, x11, x19\n"
+ "sub x9, x9, x19\n"
+ "b 75f\n"
+ "73:" // Height 4: full accumulate
+ "ldr q24, [x17, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x11, #0x0]\n"
+ "ldr q27, [x9, #0x0]\n"
+ "b 75f\n"
+ "74:" // Height 4: no accumulate
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "75:" // Height 4: setup done
+ "mov x16, #0x0\n"
+ "76:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 77f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x28, [x20, #0x18]\n"
+ "cbnz x16, 78f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "b 78f\n"
+ "77:" // Height 4: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "add x10, x12, x19, LSL #2\n"
+ "add x28, x10, x19, LSL #2\n"
+ "78:" // Height 4: input setup done
+ "cmp x15, #0x4\n"
+ "blt 81f\n"
+ "cmp x15, #0x8\n"
+ "blt 80f\n"
+ "79:" // Height 4: Multiply loop: Main loop head
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q8, [x7, #0x0]\n"
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x7, #0x10]\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x7, #0x20]\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr q11, [x7, #0x30]\n"
+ "fmla v27.4s, v8.4s, v3.s[0]\n"
+ "add x14, x14, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "add x12, x12, #0x10\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "sub x15, x15, #0x4\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "cmp x15, #0x8\n"
+ "add x7, x7, #0x40\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v27.4s, v10.4s, v3.s[2]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "fmla v27.4s, v11.4s, v3.s[3]\n"
+ "bge 79b\n"
+ "80:" // Height 4: Multiply loop: Single iteration only
+ "sub x15, x15, #0x4\n"
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q12, [x7, #0x0]\n"
+ "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "ldr q13, [x7, #0x10]\n"
+ "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "ldr q14, [x7, #0x20]\n"
+ "fmla v26.4s, v12.4s, v2.s[0]\n"
+ "ldr q15, [x7, #0x30]\n"
+ "fmla v27.4s, v12.4s, v3.s[0]\n"
+ "add x14, x14, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "fmla v24.4s, v13.4s, v0.s[1]\n"
+ "add x12, x12, #0x10\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla v25.4s, v13.4s, v1.s[1]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v26.4s, v13.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v27.4s, v13.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x7, x7, #0x40\n"
+ "fmla v24.4s, v14.4s, v0.s[2]\n"
+ "fmla v25.4s, v14.4s, v1.s[2]\n"
+ "fmla v26.4s, v14.4s, v2.s[2]\n"
+ "fmla v27.4s, v14.4s, v3.s[2]\n"
+ "fmla v24.4s, v15.4s, v0.s[3]\n"
+ "fmla v25.4s, v15.4s, v1.s[3]\n"
+ "fmla v26.4s, v15.4s, v2.s[3]\n"
+ "fmla v27.4s, v15.4s, v3.s[3]\n"
+ "81:" // Height 4: Multiply loop: Main loop skip
+ "cbz x15, 83f\n"
+ "82:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x14], #0x4\n"
+ "ldr s1, [x12], #0x4\n"
+ "ldr s2, [x10], #0x4\n"
+ "ldr s3, [x28], #0x4\n"
+ "ldr q16, [x7, #0x0]\n"
+ "fmla v24.4s, v16.4s, v0.s[0]\n"
+ "sub x15, x15, #0x1\n"
+ "fmla v25.4s, v16.4s, v1.s[0]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v26.4s, v16.4s, v2.s[0]\n"
+ "fmla v27.4s, v16.4s, v3.s[0]\n"
+ "cbnz x15, 82b\n"
+ "83:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x16, x16, #0x1\n"
+ "cmp x16, x19\n"
+ "bne 76b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "tbz %x[flags], #1, 84f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v17.4s\n"
+ "84:" // Height 4: No activation
+ "cmp x6, #0x4\n"
+ "bge 87f\n"
+ "tbz x6, #1, 85f\n"
+ "str d24, [x17], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x11], #0x8\n"
+ "str d27, [x9], #0x8\n"
+ "tbz x6, #0, 86f\n"
+ "st1 { v24.s }[2], [x17]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x11]\n"
+ "st1 { v27.s }[2], [x9]\n"
+ "b 86f\n"
+ "85:" // Height 4: Partial direct writeback: partial_1_0
+ "str s24, [x17, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x11, #0x0]\n"
+ "str s27, [x9, #0x0]\n"
+ "86:" // Height 4: Partial direct writeback: Done
+ "b 88f\n"
+ "87:" // Height 4: Full writeback
+ "str q24, [x17, #0x0]\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x11, #0x0]\n"
+ "str q27, [x9, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "add x13, x13, #0x10\n"
+ "add x11, x11, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "88:" // Height 4: Writeback done
+ "subs x6, x6, #0x4\n"
+ "bgt 69b\n"
+ "b 178f\n"
+ "89:" // Height 5
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 90f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "ldr x11, [%x[output_ptr], #0x10]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x18]\n"
+ "ldr x27, [%x[output_ptr], #0x20]\n"
+ "add x11, x11, x19, LSL #2\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "b 91f\n"
+ "90:" // Height 5: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "add x11, x13, x19, LSL #2\n"
+ "add x9, x11, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "91:" // Height 5: Column loop
+ "cbz x8, 92f\n"
+ "ldr q24, [x8, #0x0]\n"
+ "mov v25.16b, v24.16b\n"
+ "add x8, x8, #0x10\n"
+ "mov v26.16b, v24.16b\n"
+ "mov v27.16b, v24.16b\n"
+ "mov v28.16b, v24.16b\n"
+ "b 97f\n"
+ "92:" // Height 5: no bias
+ "tbz %x[flags], #0, 96f\n"
+ "cmp x6, #0x4\n"
+ "bge 95f\n"
+ "tbz x6, #1, 93f\n"
+ "ldr d24, [x17], #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x11], #0x8\n"
+ "ldr d27, [x9], #0x8\n"
+ "ldr d28, [x27], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x6, #0, 94f\n"
+ "ld1 { v24.s }[2], [x17]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x11]\n"
+ "ld1 { v27.s }[2], [x9]\n"
+ "ld1 { v28.s }[2], [x27]\n"
+ "b 94f\n"
+ "93:" // Height 5: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s24, [x17, #0x0]\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x11, #0x0]\n"
+ "ldr s27, [x9, #0x0]\n"
+ "ldr s28, [x27, #0x0]\n"
+ "94:" // Height 5: Partial accumulate: Done
+ "sub x17, x17, x19\n"
+ "sub x13, x13, x19\n"
+ "sub x11, x11, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "b 97f\n"
+ "95:" // Height 5: full accumulate
+ "ldr q24, [x17, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x11, #0x0]\n"
+ "ldr q27, [x9, #0x0]\n"
+ "ldr q28, [x27, #0x0]\n"
+ "b 97f\n"
+ "96:" // Height 5: no accumulate
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "97:" // Height 5: setup done
+ "mov x16, #0x0\n"
+ "98:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 99f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x28, [x20, #0x18]\n"
+ "ldr x26, [x20, #0x20]\n"
+ "cbnz x16, 100f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "b 100f\n"
+ "99:" // Height 5: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "add x10, x12, x19, LSL #2\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "100:" // Height 5: input setup done
+ "cmp x15, #0x4\n"
+ "blt 103f\n"
+ "cmp x15, #0x8\n"
+ "blt 102f\n"
+ "101:" // Height 5: Multiply loop: Main loop head
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q4, [x26, #0x0]\n"
+ "ldr q8, [x7, #0x0]\n"
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x7, #0x10]\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x7, #0x20]\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr q11, [x7, #0x30]\n"
+ "fmla v27.4s, v8.4s, v3.s[0]\n"
+ "add x14, x14, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "fmla v28.4s, v8.4s, v4.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x15, x15, #0x4\n"
+ "fmla v28.4s, v9.4s, v4.s[1]\n"
+ "cmp x15, #0x8\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "add x7, x7, #0x40\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v27.4s, v10.4s, v3.s[2]\n"
+ "fmla v28.4s, v10.4s, v4.s[2]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "fmla v27.4s, v11.4s, v3.s[3]\n"
+ "fmla v28.4s, v11.4s, v4.s[3]\n"
+ "bge 101b\n"
+ "102:" // Height 5: Multiply loop: Single iteration only
+ "sub x15, x15, #0x4\n"
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q4, [x26, #0x0]\n"
+ "ldr q12, [x7, #0x0]\n"
+ "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "ldr q13, [x7, #0x10]\n"
+ "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "ldr q14, [x7, #0x20]\n"
+ "fmla v26.4s, v12.4s, v2.s[0]\n"
+ "ldr q15, [x7, #0x30]\n"
+ "fmla v27.4s, v12.4s, v3.s[0]\n"
+ "add x14, x14, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "fmla v28.4s, v12.4s, v4.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v24.4s, v13.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v25.4s, v13.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v26.4s, v13.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v27.4s, v13.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x7, x7, #0x40\n"
+ "fmla v28.4s, v13.4s, v4.s[1]\n"
+ "fmla v24.4s, v14.4s, v0.s[2]\n"
+ "fmla v25.4s, v14.4s, v1.s[2]\n"
+ "fmla v26.4s, v14.4s, v2.s[2]\n"
+ "fmla v27.4s, v14.4s, v3.s[2]\n"
+ "fmla v28.4s, v14.4s, v4.s[2]\n"
+ "fmla v24.4s, v15.4s, v0.s[3]\n"
+ "fmla v25.4s, v15.4s, v1.s[3]\n"
+ "fmla v26.4s, v15.4s, v2.s[3]\n"
+ "fmla v27.4s, v15.4s, v3.s[3]\n"
+ "fmla v28.4s, v15.4s, v4.s[3]\n"
+ "103:" // Height 5: Multiply loop: Main loop skip
+ "cbz x15, 105f\n"
+ "104:" // Height 5: Multiply loop: Odd block loop
+ "ldr s0, [x14], #0x4\n"
+ "ldr s1, [x12], #0x4\n"
+ "ldr s2, [x10], #0x4\n"
+ "ldr s3, [x28], #0x4\n"
+ "ldr s4, [x26], #0x4\n"
+ "ldr q16, [x7, #0x0]\n"
+ "fmla v24.4s, v16.4s, v0.s[0]\n"
+ "sub x15, x15, #0x1\n"
+ "fmla v25.4s, v16.4s, v1.s[0]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v26.4s, v16.4s, v2.s[0]\n"
+ "fmla v27.4s, v16.4s, v3.s[0]\n"
+ "fmla v28.4s, v16.4s, v4.s[0]\n"
+ "cbnz x15, 104b\n"
+ "105:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x16, x16, #0x1\n"
+ "cmp x16, x19\n"
+ "bne 98b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "tbz %x[flags], #1, 106f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v17.4s\n"
+ "106:" // Height 5: No activation
+ "cmp x6, #0x4\n"
+ "bge 109f\n"
+ "tbz x6, #1, 107f\n"
+ "str d24, [x17], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x11], #0x8\n"
+ "str d27, [x9], #0x8\n"
+ "str d28, [x27], #0x8\n"
+ "tbz x6, #0, 108f\n"
+ "st1 { v24.s }[2], [x17]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x11]\n"
+ "st1 { v27.s }[2], [x9]\n"
+ "st1 { v28.s }[2], [x27]\n"
+ "b 108f\n"
+ "107:" // Height 5: Partial direct writeback: partial_1_0
+ "str s24, [x17, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x11, #0x0]\n"
+ "str s27, [x9, #0x0]\n"
+ "str s28, [x27, #0x0]\n"
+ "108:" // Height 5: Partial direct writeback: Done
+ "b 110f\n"
+ "109:" // Height 5: Full writeback
+ "str q24, [x17, #0x0]\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x11, #0x0]\n"
+ "str q27, [x9, #0x0]\n"
+ "str q28, [x27, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "add x13, x13, #0x10\n"
+ "add x11, x11, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "110:" // Height 5: Writeback done
+ "subs x6, x6, #0x4\n"
+ "bgt 91b\n"
+ "b 178f\n"
+ "111:" // Height 6
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 112f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "ldr x11, [%x[output_ptr], #0x10]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x18]\n"
+ "ldr x27, [%x[output_ptr], #0x20]\n"
+ "add x11, x11, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x28]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 113f\n"
+ "112:" // Height 6: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "add x11, x13, x19, LSL #2\n"
+ "add x9, x11, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "113:" // Height 6: Column loop
+ "cbz x8, 114f\n"
+ "ldr q24, [x8, #0x0]\n"
+ "mov v25.16b, v24.16b\n"
+ "add x8, x8, #0x10\n"
+ "mov v26.16b, v24.16b\n"
+ "mov v27.16b, v24.16b\n"
+ "mov v28.16b, v24.16b\n"
+ "mov v29.16b, v24.16b\n"
+ "b 119f\n"
+ "114:" // Height 6: no bias
+ "tbz %x[flags], #0, 118f\n"
+ "cmp x6, #0x4\n"
+ "bge 117f\n"
+ "tbz x6, #1, 115f\n"
+ "ldr d24, [x17], #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x11], #0x8\n"
+ "ldr d27, [x9], #0x8\n"
+ "ldr d28, [x27], #0x8\n"
+ "ldr d29, [x25], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x6, #0, 116f\n"
+ "ld1 { v24.s }[2], [x17]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x11]\n"
+ "ld1 { v27.s }[2], [x9]\n"
+ "ld1 { v28.s }[2], [x27]\n"
+ "ld1 { v29.s }[2], [x25]\n"
+ "b 116f\n"
+ "115:" // Height 6: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s24, [x17, #0x0]\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x11, #0x0]\n"
+ "ldr s27, [x9, #0x0]\n"
+ "ldr s28, [x27, #0x0]\n"
+ "ldr s29, [x25, #0x0]\n"
+ "116:" // Height 6: Partial accumulate: Done
+ "sub x17, x17, x19\n"
+ "sub x13, x13, x19\n"
+ "sub x11, x11, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "b 119f\n"
+ "117:" // Height 6: full accumulate
+ "ldr q24, [x17, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x11, #0x0]\n"
+ "ldr q27, [x9, #0x0]\n"
+ "ldr q28, [x27, #0x0]\n"
+ "ldr q29, [x25, #0x0]\n"
+ "b 119f\n"
+ "118:" // Height 6: no accumulate
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "119:" // Height 6: setup done
+ "mov x16, #0x0\n"
+ "120:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 121f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x28, [x20, #0x18]\n"
+ "ldr x26, [x20, #0x20]\n"
+ "ldr x24, [x20, #0x28]\n"
+ "cbnz x16, 122f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "b 122f\n"
+ "121:" // Height 6: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "add x10, x12, x19, LSL #2\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "122:" // Height 6: input setup done
+ "cmp x15, #0x4\n"
+ "blt 125f\n"
+ "cmp x15, #0x8\n"
+ "blt 124f\n"
+ "123:" // Height 6: Multiply loop: Main loop head
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q4, [x26, #0x0]\n"
+ "ldr q5, [x24, #0x0]\n"
+ "ldr q8, [x7, #0x0]\n"
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x7, #0x10]\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x7, #0x20]\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr q11, [x7, #0x30]\n"
+ "fmla v27.4s, v8.4s, v3.s[0]\n"
+ "add x14, x14, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "fmla v28.4s, v8.4s, v4.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.4s, v8.4s, v5.s[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sub x15, x15, #0x4\n"
+ "fmla v28.4s, v9.4s, v4.s[1]\n"
+ "cmp x15, #0x8\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "add x7, x7, #0x40\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v27.4s, v10.4s, v3.s[2]\n"
+ "fmla v28.4s, v10.4s, v4.s[2]\n"
+ "fmla v29.4s, v10.4s, v5.s[2]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "fmla v27.4s, v11.4s, v3.s[3]\n"
+ "fmla v28.4s, v11.4s, v4.s[3]\n"
+ "fmla v29.4s, v11.4s, v5.s[3]\n"
+ "bge 123b\n"
+ "124:" // Height 6: Multiply loop: Single iteration only
+ "sub x15, x15, #0x4\n"
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q4, [x26, #0x0]\n"
+ "ldr q5, [x24, #0x0]\n"
+ "ldr q12, [x7, #0x0]\n"
+ "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "ldr q13, [x7, #0x10]\n"
+ "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "ldr q14, [x7, #0x20]\n"
+ "fmla v26.4s, v12.4s, v2.s[0]\n"
+ "ldr q15, [x7, #0x30]\n"
+ "fmla v27.4s, v12.4s, v3.s[0]\n"
+ "add x14, x14, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "fmla v28.4s, v12.4s, v4.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.4s, v12.4s, v5.s[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v24.4s, v13.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v25.4s, v13.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v26.4s, v13.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v27.4s, v13.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x7, x7, #0x40\n"
+ "fmla v28.4s, v13.4s, v4.s[1]\n"
+ "fmla v29.4s, v13.4s, v5.s[1]\n"
+ "fmla v24.4s, v14.4s, v0.s[2]\n"
+ "fmla v25.4s, v14.4s, v1.s[2]\n"
+ "fmla v26.4s, v14.4s, v2.s[2]\n"
+ "fmla v27.4s, v14.4s, v3.s[2]\n"
+ "fmla v28.4s, v14.4s, v4.s[2]\n"
+ "fmla v29.4s, v14.4s, v5.s[2]\n"
+ "fmla v24.4s, v15.4s, v0.s[3]\n"
+ "fmla v25.4s, v15.4s, v1.s[3]\n"
+ "fmla v26.4s, v15.4s, v2.s[3]\n"
+ "fmla v27.4s, v15.4s, v3.s[3]\n"
+ "fmla v28.4s, v15.4s, v4.s[3]\n"
+ "fmla v29.4s, v15.4s, v5.s[3]\n"
+ "125:" // Height 6: Multiply loop: Main loop skip
+ "cbz x15, 127f\n"
+ "126:" // Height 6: Multiply loop: Odd block loop
+ "ldr s0, [x14], #0x4\n"
+ "ldr s1, [x12], #0x4\n"
+ "ldr s2, [x10], #0x4\n"
+ "ldr s3, [x28], #0x4\n"
+ "ldr s4, [x26], #0x4\n"
+ "ldr s5, [x24], #0x4\n"
+ "ldr q16, [x7, #0x0]\n"
+ "fmla v24.4s, v16.4s, v0.s[0]\n"
+ "sub x15, x15, #0x1\n"
+ "fmla v25.4s, v16.4s, v1.s[0]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v26.4s, v16.4s, v2.s[0]\n"
+ "fmla v27.4s, v16.4s, v3.s[0]\n"
+ "fmla v28.4s, v16.4s, v4.s[0]\n"
+ "fmla v29.4s, v16.4s, v5.s[0]\n"
+ "cbnz x15, 126b\n"
+ "127:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x16, x16, #0x1\n"
+ "cmp x16, x19\n"
+ "bne 120b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbz %x[flags], #1, 128f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v17.4s\n"
+ "fmax v29.4s, v29.4s, v17.4s\n"
+ "128:" // Height 6: No activation
+ "cmp x6, #0x4\n"
+ "bge 131f\n"
+ "tbz x6, #1, 129f\n"
+ "str d24, [x17], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x11], #0x8\n"
+ "str d27, [x9], #0x8\n"
+ "str d28, [x27], #0x8\n"
+ "str d29, [x25], #0x8\n"
+ "tbz x6, #0, 130f\n"
+ "st1 { v24.s }[2], [x17]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x11]\n"
+ "st1 { v27.s }[2], [x9]\n"
+ "st1 { v28.s }[2], [x27]\n"
+ "st1 { v29.s }[2], [x25]\n"
+ "b 130f\n"
+ "129:" // Height 6: Partial direct writeback: partial_1_0
+ "str s24, [x17, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x11, #0x0]\n"
+ "str s27, [x9, #0x0]\n"
+ "str s28, [x27, #0x0]\n"
+ "str s29, [x25, #0x0]\n"
+ "130:" // Height 6: Partial direct writeback: Done
+ "b 132f\n"
+ "131:" // Height 6: Full writeback
+ "str q24, [x17, #0x0]\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x11, #0x0]\n"
+ "str q27, [x9, #0x0]\n"
+ "str q28, [x27, #0x0]\n"
+ "str q29, [x25, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "add x13, x13, #0x10\n"
+ "add x11, x11, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "132:" // Height 6: Writeback done
+ "subs x6, x6, #0x4\n"
+ "bgt 113b\n"
+ "b 178f\n"
+ "133:" // Height 7
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 134f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "ldr x11, [%x[output_ptr], #0x10]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x18]\n"
+ "ldr x27, [%x[output_ptr], #0x20]\n"
+ "add x11, x11, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x28]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x23, [%x[output_ptr], #0x30]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 135f\n"
+ "134:" // Height 7: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "add x11, x13, x19, LSL #2\n"
+ "add x9, x11, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "135:" // Height 7: Column loop
+ "cbz x8, 136f\n"
+ "ldr q24, [x8, #0x0]\n"
+ "mov v25.16b, v24.16b\n"
+ "add x8, x8, #0x10\n"
+ "mov v26.16b, v24.16b\n"
+ "mov v27.16b, v24.16b\n"
+ "mov v28.16b, v24.16b\n"
+ "mov v29.16b, v24.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "b 141f\n"
+ "136:" // Height 7: no bias
+ "tbz %x[flags], #0, 140f\n"
+ "cmp x6, #0x4\n"
+ "bge 139f\n"
+ "tbz x6, #1, 137f\n"
+ "ldr d24, [x17], #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x11], #0x8\n"
+ "ldr d27, [x9], #0x8\n"
+ "ldr d28, [x27], #0x8\n"
+ "ldr d29, [x25], #0x8\n"
+ "ldr d30, [x23], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x6, #0, 138f\n"
+ "ld1 { v24.s }[2], [x17]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x11]\n"
+ "ld1 { v27.s }[2], [x9]\n"
+ "ld1 { v28.s }[2], [x27]\n"
+ "ld1 { v29.s }[2], [x25]\n"
+ "ld1 { v30.s }[2], [x23]\n"
+ "b 138f\n"
+ "137:" // Height 7: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s24, [x17, #0x0]\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x11, #0x0]\n"
+ "ldr s27, [x9, #0x0]\n"
+ "ldr s28, [x27, #0x0]\n"
+ "ldr s29, [x25, #0x0]\n"
+ "ldr s30, [x23, #0x0]\n"
+ "138:" // Height 7: Partial accumulate: Done
+ "sub x17, x17, x19\n"
+ "sub x13, x13, x19\n"
+ "sub x11, x11, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "sub x23, x23, x19\n"
+ "b 141f\n"
+ "139:" // Height 7: full accumulate
+ "ldr q24, [x17, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x11, #0x0]\n"
+ "ldr q27, [x9, #0x0]\n"
+ "ldr q28, [x27, #0x0]\n"
+ "ldr q29, [x25, #0x0]\n"
+ "ldr q30, [x23, #0x0]\n"
+ "b 141f\n"
+ "140:" // Height 7: no accumulate
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "141:" // Height 7: setup done
+ "mov x16, #0x0\n"
+ "142:" // Height 7: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 143f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x28, [x20, #0x18]\n"
+ "ldr x26, [x20, #0x20]\n"
+ "ldr x24, [x20, #0x28]\n"
+ "ldr x22, [x20, #0x30]\n"
+ "cbnz x16, 144f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "b 144f\n"
+ "143:" // Height 7: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "add x10, x12, x19, LSL #2\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "add x22, x24, x19, LSL #2\n"
+ "144:" // Height 7: input setup done
+ "cmp x15, #0x4\n"
+ "blt 147f\n"
+ "cmp x15, #0x8\n"
+ "blt 146f\n"
+ "145:" // Height 7: Multiply loop: Main loop head
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q4, [x26, #0x0]\n"
+ "ldr q5, [x24, #0x0]\n"
+ "ldr q6, [x22, #0x0]\n"
+ "ldr q8, [x7, #0x0]\n"
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x7, #0x10]\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x7, #0x20]\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr q11, [x7, #0x30]\n"
+ "fmla v27.4s, v8.4s, v3.s[0]\n"
+ "add x14, x14, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "fmla v28.4s, v8.4s, v4.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.4s, v8.4s, v5.s[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v30.4s, v8.4s, v6.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sub x15, x15, #0x4\n"
+ "fmla v28.4s, v9.4s, v4.s[1]\n"
+ "cmp x15, #0x8\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "add x7, x7, #0x40\n"
+ "fmla v30.4s, v9.4s, v6.s[1]\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v27.4s, v10.4s, v3.s[2]\n"
+ "fmla v28.4s, v10.4s, v4.s[2]\n"
+ "fmla v29.4s, v10.4s, v5.s[2]\n"
+ "fmla v30.4s, v10.4s, v6.s[2]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "fmla v27.4s, v11.4s, v3.s[3]\n"
+ "fmla v28.4s, v11.4s, v4.s[3]\n"
+ "fmla v29.4s, v11.4s, v5.s[3]\n"
+ "fmla v30.4s, v11.4s, v6.s[3]\n"
+ "bge 145b\n"
+ "146:" // Height 7: Multiply loop: Single iteration only
+ "sub x15, x15, #0x4\n"
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q4, [x26, #0x0]\n"
+ "ldr q5, [x24, #0x0]\n"
+ "ldr q6, [x22, #0x0]\n"
+ "ldr q12, [x7, #0x0]\n"
+ "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "ldr q13, [x7, #0x10]\n"
+ "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "ldr q14, [x7, #0x20]\n"
+ "fmla v26.4s, v12.4s, v2.s[0]\n"
+ "ldr q15, [x7, #0x30]\n"
+ "fmla v27.4s, v12.4s, v3.s[0]\n"
+ "add x14, x14, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "fmla v28.4s, v12.4s, v4.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.4s, v12.4s, v5.s[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v30.4s, v12.4s, v6.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v24.4s, v13.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v25.4s, v13.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v26.4s, v13.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v27.4s, v13.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "add x7, x7, #0x40\n"
+ "fmla v28.4s, v13.4s, v4.s[1]\n"
+ "fmla v29.4s, v13.4s, v5.s[1]\n"
+ "fmla v30.4s, v13.4s, v6.s[1]\n"
+ "fmla v24.4s, v14.4s, v0.s[2]\n"
+ "fmla v25.4s, v14.4s, v1.s[2]\n"
+ "fmla v26.4s, v14.4s, v2.s[2]\n"
+ "fmla v27.4s, v14.4s, v3.s[2]\n"
+ "fmla v28.4s, v14.4s, v4.s[2]\n"
+ "fmla v29.4s, v14.4s, v5.s[2]\n"
+ "fmla v30.4s, v14.4s, v6.s[2]\n"
+ "fmla v24.4s, v15.4s, v0.s[3]\n"
+ "fmla v25.4s, v15.4s, v1.s[3]\n"
+ "fmla v26.4s, v15.4s, v2.s[3]\n"
+ "fmla v27.4s, v15.4s, v3.s[3]\n"
+ "fmla v28.4s, v15.4s, v4.s[3]\n"
+ "fmla v29.4s, v15.4s, v5.s[3]\n"
+ "fmla v30.4s, v15.4s, v6.s[3]\n"
+ "147:" // Height 7: Multiply loop: Main loop skip
+ "cbz x15, 149f\n"
+ "148:" // Height 7: Multiply loop: Odd block loop
+ "ldr s0, [x14], #0x4\n"
+ "ldr s1, [x12], #0x4\n"
+ "ldr s2, [x10], #0x4\n"
+ "ldr s3, [x28], #0x4\n"
+ "ldr s4, [x26], #0x4\n"
+ "ldr s5, [x24], #0x4\n"
+ "ldr s6, [x22], #0x4\n"
+ "ldr q16, [x7, #0x0]\n"
+ "fmla v24.4s, v16.4s, v0.s[0]\n"
+ "sub x15, x15, #0x1\n"
+ "fmla v25.4s, v16.4s, v1.s[0]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v26.4s, v16.4s, v2.s[0]\n"
+ "fmla v27.4s, v16.4s, v3.s[0]\n"
+ "fmla v28.4s, v16.4s, v4.s[0]\n"
+ "fmla v29.4s, v16.4s, v5.s[0]\n"
+ "fmla v30.4s, v16.4s, v6.s[0]\n"
+ "cbnz x15, 148b\n"
+ "149:" // Height 7: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x16, x16, #0x1\n"
+ "cmp x16, x19\n"
+ "bne 142b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 150f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v16.4s\n"
+ "fmin v30.4s, v30.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v17.4s\n"
+ "fmax v29.4s, v29.4s, v17.4s\n"
+ "fmax v30.4s, v30.4s, v17.4s\n"
+ "150:" // Height 7: No activation
+ "cmp x6, #0x4\n"
+ "bge 153f\n"
+ "tbz x6, #1, 151f\n"
+ "str d24, [x17], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x11], #0x8\n"
+ "str d27, [x9], #0x8\n"
+ "str d28, [x27], #0x8\n"
+ "str d29, [x25], #0x8\n"
+ "str d30, [x23], #0x8\n"
+ "tbz x6, #0, 152f\n"
+ "st1 { v24.s }[2], [x17]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x11]\n"
+ "st1 { v27.s }[2], [x9]\n"
+ "st1 { v28.s }[2], [x27]\n"
+ "st1 { v29.s }[2], [x25]\n"
+ "st1 { v30.s }[2], [x23]\n"
+ "b 152f\n"
+ "151:" // Height 7: Partial direct writeback: partial_1_0
+ "str s24, [x17, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x11, #0x0]\n"
+ "str s27, [x9, #0x0]\n"
+ "str s28, [x27, #0x0]\n"
+ "str s29, [x25, #0x0]\n"
+ "str s30, [x23, #0x0]\n"
+ "152:" // Height 7: Partial direct writeback: Done
+ "b 154f\n"
+ "153:" // Height 7: Full writeback
+ "str q24, [x17, #0x0]\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x11, #0x0]\n"
+ "str q27, [x9, #0x0]\n"
+ "str q28, [x27, #0x0]\n"
+ "str q29, [x25, #0x0]\n"
+ "str q30, [x23, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "add x13, x13, #0x10\n"
+ "add x11, x11, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "154:" // Height 7: Writeback done
+ "subs x6, x6, #0x4\n"
+ "bgt 135b\n"
+ "b 178f\n"
+ "155:" // Height 8
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 156f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "ldr x11, [%x[output_ptr], #0x10]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x18]\n"
+ "ldr x27, [%x[output_ptr], #0x20]\n"
+ "add x11, x11, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x28]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x23, [%x[output_ptr], #0x30]\n"
+ "ldr x21, [%x[output_ptr], #0x38]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add %x[output_ptr], %x[output_ptr], #0x40\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 157f\n"
+ "156:" // Height 8: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "add x11, x13, x19, LSL #2\n"
+ "add x9, x11, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "add x21, x23, x19, LSL #2\n"
+ "add %x[output_ptr], x21, x19, LSL #2\n"
+ "157:" // Height 8: Column loop
+ "cbz x8, 158f\n"
+ "ldr q24, [x8, #0x0]\n"
+ "mov v25.16b, v24.16b\n"
+ "add x8, x8, #0x10\n"
+ "mov v26.16b, v24.16b\n"
+ "mov v27.16b, v24.16b\n"
+ "mov v28.16b, v24.16b\n"
+ "mov v29.16b, v24.16b\n"
+ "mov v30.16b, v24.16b\n"
+ "mov v31.16b, v24.16b\n"
+ "b 163f\n"
+ "158:" // Height 8: no bias
+ "tbz %x[flags], #0, 162f\n"
+ "cmp x6, #0x4\n"
+ "bge 161f\n"
+ "tbz x6, #1, 159f\n"
+ "ldr d24, [x17], #0x8\n"
+ "ldr d25, [x13], #0x8\n"
+ "ldr d26, [x11], #0x8\n"
+ "ldr d27, [x9], #0x8\n"
+ "ldr d28, [x27], #0x8\n"
+ "ldr d29, [x25], #0x8\n"
+ "ldr d30, [x23], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x6, #0, 160f\n"
+ "ld1 { v24.s }[2], [x17]\n"
+ "ld1 { v25.s }[2], [x13]\n"
+ "ld1 { v26.s }[2], [x11]\n"
+ "ld1 { v27.s }[2], [x9]\n"
+ "ld1 { v28.s }[2], [x27]\n"
+ "ld1 { v29.s }[2], [x25]\n"
+ "ld1 { v30.s }[2], [x23]\n"
+ "ld1 { v31.s }[2], [x21]\n"
+ "b 160f\n"
+ "159:" // Height 8: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s24, [x17, #0x0]\n"
+ "ldr s25, [x13, #0x0]\n"
+ "ldr s26, [x11, #0x0]\n"
+ "ldr s27, [x9, #0x0]\n"
+ "ldr s28, [x27, #0x0]\n"
+ "ldr s29, [x25, #0x0]\n"
+ "ldr s30, [x23, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
+ "160:" // Height 8: Partial accumulate: Done
+ "sub x17, x17, x19\n"
+ "sub x13, x13, x19\n"
+ "sub x11, x11, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "sub x23, x23, x19\n"
+ "sub x21, x21, x19\n"
+ "b 163f\n"
+ "161:" // Height 8: full accumulate
+ "ldr q24, [x17, #0x0]\n"
+ "ldr q25, [x13, #0x0]\n"
+ "ldr q26, [x11, #0x0]\n"
+ "ldr q27, [x9, #0x0]\n"
+ "ldr q28, [x27, #0x0]\n"
+ "ldr q29, [x25, #0x0]\n"
+ "ldr q30, [x23, #0x0]\n"
+ "ldr q31, [x21, #0x0]\n"
+ "b 163f\n"
+ "162:" // Height 8: no accumulate
+ "movi v24.16b, #0x0\n"
+ "movi v25.16b, #0x0\n"
+ "movi v26.16b, #0x0\n"
+ "movi v27.16b, #0x0\n"
+ "movi v28.16b, #0x0\n"
+ "movi v29.16b, #0x0\n"
+ "movi v30.16b, #0x0\n"
+ "movi v31.16b, #0x0\n"
+ "163:" // Height 8: setup done
+ "mov x16, #0x0\n"
+ "164:" // Height 8: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 165f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x28, [x20, #0x18]\n"
+ "ldr x26, [x20, #0x20]\n"
+ "ldr x24, [x20, #0x28]\n"
+ "ldr x22, [x20, #0x30]\n"
+ "ldr x20, [x20, #0x38]\n"
+ "cbnz x16, 166f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "add x20, x20, x19, LSL #2\n"
+ "b 166f\n"
+ "165:" // Height 8: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "add x10, x12, x19, LSL #2\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "add x22, x24, x19, LSL #2\n"
+ "add x20, x22, x19, LSL #2\n"
+ "166:" // Height 8: input setup done
+ "cmp x15, #0x4\n"
+ "blt 169f\n"
+ "cmp x15, #0x8\n"
+ "blt 168f\n"
+ "167:" // Height 8: Multiply loop: Main loop head
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q4, [x26, #0x0]\n"
+ "ldr q5, [x24, #0x0]\n"
+ "ldr q6, [x22, #0x0]\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q8, [x7, #0x0]\n"
+ "fmla v24.4s, v8.4s, v0.s[0]\n"
+ "ldr q9, [x7, #0x10]\n"
+ "fmla v25.4s, v8.4s, v1.s[0]\n"
+ "ldr q10, [x7, #0x20]\n"
+ "fmla v26.4s, v8.4s, v2.s[0]\n"
+ "ldr q11, [x7, #0x30]\n"
+ "fmla v27.4s, v8.4s, v3.s[0]\n"
+ "add x14, x14, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "fmla v28.4s, v8.4s, v4.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.4s, v8.4s, v5.s[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v30.4s, v8.4s, v6.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v31.4s, v8.4s, v7.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v24.4s, v9.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v25.4s, v9.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v26.4s, v9.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v27.4s, v9.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "sub x15, x15, #0x4\n"
+ "fmla v28.4s, v9.4s, v4.s[1]\n"
+ "cmp x15, #0x8\n"
+ "fmla v29.4s, v9.4s, v5.s[1]\n"
+ "add x7, x7, #0x40\n"
+ "fmla v30.4s, v9.4s, v6.s[1]\n"
+ "fmla v31.4s, v9.4s, v7.s[1]\n"
+ "fmla v24.4s, v10.4s, v0.s[2]\n"
+ "fmla v25.4s, v10.4s, v1.s[2]\n"
+ "fmla v26.4s, v10.4s, v2.s[2]\n"
+ "fmla v27.4s, v10.4s, v3.s[2]\n"
+ "fmla v28.4s, v10.4s, v4.s[2]\n"
+ "fmla v29.4s, v10.4s, v5.s[2]\n"
+ "fmla v30.4s, v10.4s, v6.s[2]\n"
+ "fmla v31.4s, v10.4s, v7.s[2]\n"
+ "fmla v24.4s, v11.4s, v0.s[3]\n"
+ "fmla v25.4s, v11.4s, v1.s[3]\n"
+ "fmla v26.4s, v11.4s, v2.s[3]\n"
+ "fmla v27.4s, v11.4s, v3.s[3]\n"
+ "fmla v28.4s, v11.4s, v4.s[3]\n"
+ "fmla v29.4s, v11.4s, v5.s[3]\n"
+ "fmla v30.4s, v11.4s, v6.s[3]\n"
+ "fmla v31.4s, v11.4s, v7.s[3]\n"
+ "bge 167b\n"
+ "168:" // Height 8: Multiply loop: Single iteration only
+ "sub x15, x15, #0x4\n"
+ "ldr q0, [x14, #0x0]\n"
+ "ldr q1, [x12, #0x0]\n"
+ "ldr q2, [x10, #0x0]\n"
+ "ldr q3, [x28, #0x0]\n"
+ "ldr q4, [x26, #0x0]\n"
+ "ldr q5, [x24, #0x0]\n"
+ "ldr q6, [x22, #0x0]\n"
+ "ldr q7, [x20, #0x0]\n"
+ "ldr q12, [x7, #0x0]\n"
+ "fmla v24.4s, v12.4s, v0.s[0]\n"
+ "ldr q13, [x7, #0x10]\n"
+ "fmla v25.4s, v12.4s, v1.s[0]\n"
+ "ldr q14, [x7, #0x20]\n"
+ "fmla v26.4s, v12.4s, v2.s[0]\n"
+ "ldr q15, [x7, #0x30]\n"
+ "fmla v27.4s, v12.4s, v3.s[0]\n"
+ "add x14, x14, #0x10\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "fmla v28.4s, v12.4s, v4.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "fmla v29.4s, v12.4s, v5.s[0]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "add x10, x10, #0x10\n"
+ "fmla v30.4s, v12.4s, v6.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla v31.4s, v12.4s, v7.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla v24.4s, v13.4s, v0.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla v25.4s, v13.4s, v1.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "fmla v26.4s, v13.4s, v2.s[1]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "add x20, x20, #0x10\n"
+ "fmla v27.4s, v13.4s, v3.s[1]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "add x7, x7, #0x40\n"
+ "fmla v28.4s, v13.4s, v4.s[1]\n"
+ "fmla v29.4s, v13.4s, v5.s[1]\n"
+ "fmla v30.4s, v13.4s, v6.s[1]\n"
+ "fmla v31.4s, v13.4s, v7.s[1]\n"
+ "fmla v24.4s, v14.4s, v0.s[2]\n"
+ "fmla v25.4s, v14.4s, v1.s[2]\n"
+ "fmla v26.4s, v14.4s, v2.s[2]\n"
+ "fmla v27.4s, v14.4s, v3.s[2]\n"
+ "fmla v28.4s, v14.4s, v4.s[2]\n"
+ "fmla v29.4s, v14.4s, v5.s[2]\n"
+ "fmla v30.4s, v14.4s, v6.s[2]\n"
+ "fmla v31.4s, v14.4s, v7.s[2]\n"
+ "fmla v24.4s, v15.4s, v0.s[3]\n"
+ "fmla v25.4s, v15.4s, v1.s[3]\n"
+ "fmla v26.4s, v15.4s, v2.s[3]\n"
+ "fmla v27.4s, v15.4s, v3.s[3]\n"
+ "fmla v28.4s, v15.4s, v4.s[3]\n"
+ "fmla v29.4s, v15.4s, v5.s[3]\n"
+ "fmla v30.4s, v15.4s, v6.s[3]\n"
+ "fmla v31.4s, v15.4s, v7.s[3]\n"
+ "169:" // Height 8: Multiply loop: Main loop skip
+ "cbz x15, 171f\n"
+ "170:" // Height 8: Multiply loop: Odd block loop
+ "ldr s0, [x14], #0x4\n"
+ "ldr s1, [x12], #0x4\n"
+ "ldr s2, [x10], #0x4\n"
+ "ldr s3, [x28], #0x4\n"
+ "ldr s4, [x26], #0x4\n"
+ "ldr s5, [x24], #0x4\n"
+ "ldr s6, [x22], #0x4\n"
+ "ldr s7, [x20], #0x4\n"
+ "ldr q16, [x7, #0x0]\n"
+ "fmla v24.4s, v16.4s, v0.s[0]\n"
+ "sub x15, x15, #0x1\n"
+ "fmla v25.4s, v16.4s, v1.s[0]\n"
+ "add x7, x7, #0x10\n"
+ "fmla v26.4s, v16.4s, v2.s[0]\n"
+ "fmla v27.4s, v16.4s, v3.s[0]\n"
+ "fmla v28.4s, v16.4s, v4.s[0]\n"
+ "fmla v29.4s, v16.4s, v5.s[0]\n"
+ "fmla v30.4s, v16.4s, v6.s[0]\n"
+ "fmla v31.4s, v16.4s, v7.s[0]\n"
+ "cbnz x15, 170b\n"
+ "171:" // Height 8: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x16, x16, #0x1\n"
+ "cmp x16, x19\n"
+ "bne 164b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 172f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1r { v17.4s }, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1r { v16.4s }, [x19]\n"
+ "fmin v24.4s, v24.4s, v16.4s\n"
+ "fmin v25.4s, v25.4s, v16.4s\n"
+ "fmin v26.4s, v26.4s, v16.4s\n"
+ "fmin v27.4s, v27.4s, v16.4s\n"
+ "fmax v24.4s, v24.4s, v17.4s\n"
+ "fmax v25.4s, v25.4s, v17.4s\n"
+ "fmax v26.4s, v26.4s, v17.4s\n"
+ "fmax v27.4s, v27.4s, v17.4s\n"
+ "fmin v28.4s, v28.4s, v16.4s\n"
+ "fmin v29.4s, v29.4s, v16.4s\n"
+ "fmin v30.4s, v30.4s, v16.4s\n"
+ "fmax v28.4s, v28.4s, v17.4s\n"
+ "fmax v29.4s, v29.4s, v17.4s\n"
+ "fmax v30.4s, v30.4s, v17.4s\n"
+ "fmin v31.4s, v31.4s, v16.4s\n"
+ "fmax v31.4s, v31.4s, v17.4s\n"
+ "172:" // Height 8: No activation
+ "cmp x6, #0x4\n"
+ "bge 175f\n"
+ "tbz x6, #1, 173f\n"
+ "str d24, [x17], #0x8\n"
+ "str d25, [x13], #0x8\n"
+ "str d26, [x11], #0x8\n"
+ "str d27, [x9], #0x8\n"
+ "str d28, [x27], #0x8\n"
+ "str d29, [x25], #0x8\n"
+ "str d30, [x23], #0x8\n"
+ "str d31, [x21], #0x8\n"
+ "tbz x6, #0, 174f\n"
+ "st1 { v24.s }[2], [x17]\n"
+ "st1 { v25.s }[2], [x13]\n"
+ "st1 { v26.s }[2], [x11]\n"
+ "st1 { v27.s }[2], [x9]\n"
+ "st1 { v28.s }[2], [x27]\n"
+ "st1 { v29.s }[2], [x25]\n"
+ "st1 { v30.s }[2], [x23]\n"
+ "st1 { v31.s }[2], [x21]\n"
+ "b 174f\n"
+ "173:" // Height 8: Partial direct writeback: partial_1_0
+ "str s24, [x17, #0x0]\n"
+ "str s25, [x13, #0x0]\n"
+ "str s26, [x11, #0x0]\n"
+ "str s27, [x9, #0x0]\n"
+ "str s28, [x27, #0x0]\n"
+ "str s29, [x25, #0x0]\n"
+ "str s30, [x23, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
+ "174:" // Height 8: Partial direct writeback: Done
+ "b 176f\n"
+ "175:" // Height 8: Full writeback
+ "str q24, [x17, #0x0]\n"
+ "str q25, [x13, #0x0]\n"
+ "str q26, [x11, #0x0]\n"
+ "str q27, [x9, #0x0]\n"
+ "str q28, [x27, #0x0]\n"
+ "str q29, [x25, #0x0]\n"
+ "str q30, [x23, #0x0]\n"
+ "str q31, [x21, #0x0]\n"
+ "add x17, x17, #0x10\n"
+ "add x13, x13, #0x10\n"
+ "add x11, x11, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "176:" // Height 8: Writeback done
+ "subs x6, x6, #0x4\n"
+ "bgt 157b\n"
+ "subs %x[M], %x[M], #0x8\n"
+ "beq 178f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 177f\n"
+ "add x20, x20, #0x8\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "177:" // Update direct input
+ "mov x19, #0x20\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "178:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
index a23101a7ce..4bb7a1e0eb 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,38 +10,43 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
-#include <cstdint>
#include "../std_transforms_fixed.hpp"
+#define ARGLIST /* parameter-type list shared by the kernel declaration and the kern_type typedef in this header; #undef'd at the end of the header. Names in the comments follow the kernel definition in generic.cpp. */ \
+ unsigned int, const unsigned int *, /* num_strings, string_lengths */ \
+ IndirectInputArg<int8_t>, /* A_arg */ \
+ size_t, size_t, /* M, N */ \
+ const int8_t *, /* B_ptr */ \
+ IndirectOutputArg<int8_t>, /* output_arg */ \
+ const Requantize32 *, const int32_t *, unsigned int /* qp, col_bias, and a trailing argument left unnamed in the definition */
+
namespace arm_gemm
{
// Actual kernel implementations
-void a64_hybrid_s8s32_dot_16x4(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
-void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+void a64_hybrid_s8qa_dot_4x16( ARGLIST );
-class hybrid_s8s32_dot_16x4
+class cls_a64_hybrid_s8qa_dot_4x16
{
public:
typedef int8_t operand_type;
- typedef int32_t result_type;
+ typedef int8_t result_type;
- typedef void (*kern_type)(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
static constexpr unsigned int out_height()
@@ -61,32 +66,20 @@ public:
static constexpr bool supports_accumulate()
{
- return true;
- }
-
- static constexpr bool supports_bias()
- {
- return false;
- }
-
- static constexpr bool supports_activation()
- {
return false;
}
StdTransformsFixed<operand_type, result_type, 4, 16, 4> transforms = {};
// Default to the generic kernel
- kern_type kernel=a64_hybrid_s8s32_dot_16x4;
+ kern_type kernel=a64_hybrid_s8qa_dot_4x16;
- hybrid_s8s32_dot_16x4(const CPUInfo *ci)
+ cls_a64_hybrid_s8qa_dot_4x16(const CPUInfo *)
{
- if (ci->get_cpu_model() == CPUModel::A55r1) {
- kernel = a64_hybrid_s8s32_dot_16x4_a55;
- }
}
};
} // namespace arm_gemm
+#undef ARGLIST
#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
new file mode 100644
index 0000000000..3fb365bc1e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qa_dot_4x16/generic.cpp
@@ -0,0 +1,2072 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_hybrid_s8qa_dot_4x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+ struct KernelArgs { // Bundle of values the inline assembly loads from memory via args_ptr plus an %[offsetof_<field>] immediate (presumably offsetof(KernelArgs, <field>); the operand list is outside this excerpt)
+ unsigned int num_strings = {}; // copy of the num_strings argument; the assembly's string loop runs until its counter equals this
+ const unsigned int *string_lengths = {}; // copy of the string_lengths argument; the assembly indexes it by the current string number
+ size_t N = {}; // copy of the N argument
+ const int8_t *B_ptr = {}; // copy of the B_ptr argument
+ size_t output_offset = {}; // output_arg.indirect.offset when the output is indirect, otherwise output_arg.direct.stride
+ size_t input_initial_col = {}; // A_arg.indirect.start_col; only assigned when the input is indirect (stays zero otherwise)
+ size_t input_offset = {}; // A_arg.indirect.start_row when the input is indirect, otherwise A_arg.direct.stride
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 94f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 63f\n"
+ "beq 32f\n"
+ "movi v11.4s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "movi v12.4s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[col_bias]\n"
+ "movi v13.4s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "add x9, x9, x19\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "4:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "5:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 6f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 7f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "b 7f\n"
+ "6:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "7:" // Height 1: input setup done
+ "cmp x27, #0x10\n"
+ "blt 12f\n"
+ "cmp x27, #0x20\n"
+ "blt 10f\n"
+ "8:" // Height 1: Multiply loop: Main loop head
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q4, [x11, #0x0]\n"
+ ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x10]\n"
+ "ldr q6, [x11, #0x20]\n"
+ ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x30]\n"
+ ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
+ "ldr q8, [x11, #0x40]\n"
+ "ldr q9, [x11, #0x50]\n"
+ ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
+ "ldr q10, [x11, #0x60]\n"
+ "ldr q4, [x11, #0x70]\n"
+ ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
+ "ldr q5, [x11, #0x80]\n"
+ "ldr q6, [x11, #0x90]\n"
+ ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
+ "ldr q7, [x11, #0xa0]\n"
+ ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
+ "ldr q8, [x11, #0xb0]\n"
+ "ldr q9, [x11, #0xc0]\n"
+ ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
+ "ldr q10, [x11, #0xd0]\n"
+ "ldr q4, [x11, #0xe0]\n"
+ ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
+ "ldr q5, [x11, #0xf0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ "tbnz %x[flags], #31, 9f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ "9:" // Height 1: Multiply loop: unique 1: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x20\n"
+ "bge 8b\n"
+ "10:" // Height 1: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q6, [x11, #0x0]\n"
+ ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x10]\n"
+ "ldr q8, [x11, #0x20]\n"
+ ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
+ "ldr q9, [x11, #0x30]\n"
+ ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
+ "ldr q10, [x11, #0x40]\n"
+ "ldr q4, [x11, #0x50]\n"
+ ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x60]\n"
+ "ldr q6, [x11, #0x70]\n"
+ ".inst 0x4fa0e150 // sdot v16.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]\n"
+ "ldr q7, [x11, #0x80]\n"
+ "ldr q8, [x11, #0x90]\n"
+ ".inst 0x4fa0e0b2 // sdot v18.4s, v5.16b, v0.4b[1]\n"
+ "ldr q9, [x11, #0xa0]\n"
+ ".inst 0x4fa0e0d3 // sdot v19.4s, v6.16b, v0.4b[1]\n"
+ "ldr q10, [x11, #0xb0]\n"
+ "ldr q4, [x11, #0xc0]\n"
+ ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f80e911 // sdot v17.4s, v8.16b, v0.4b[2]\n"
+ "ldr q5, [x11, #0xd0]\n"
+ "ldr q6, [x11, #0xe0]\n"
+ ".inst 0x4f80e932 // sdot v18.4s, v9.16b, v0.4b[2]\n"
+ "ldr q7, [x11, #0xf0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f80e953 // sdot v19.4s, v10.16b, v0.4b[2]\n"
+ ".inst 0x4fa0e890 // sdot v16.4s, v4.16b, v0.4b[3]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n"
+ "tbnz %x[flags], #31, 11f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ "11:" // Height 1: Multiply loop: unique 2: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "12:" // Height 1: Multiply loop: Main loop skip
+ "cbz x27, 19f\n"
+ "cmp x27, #0x4\n"
+ "blt 15f\n"
+ "13:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x26], #0x4\n"
+ "tbnz %x[flags], #31, 14f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ "14:" // Height 1: Multiply loop: unique 3: skip row sum
+ "ldr q8, [x11, #0x0]\n"
+ ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q9, [x11, #0x10]\n"
+ "ldr q10, [x11, #0x20]\n"
+ ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q4, [x11, #0x30]\n"
+ ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "sub x27, x27, #0x4\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n"
+ "cmp x27, #0x4\n"
+ "bge 13b\n"
+ "cbz x27, 19f\n"
+ "15:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x27, #1, 16f\n"
+ "ldr h0, [x26], #0x2\n"
+ "tbz x27, #0, 17f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "b 17f\n"
+ "16:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x26, #0x0]\n"
+ "17:" // Height 1: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 18f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ "18:" // Height 1: Multiply loop: unique 4: skip row sum
+ "ldr q5, [x11, #0x0]\n"
+ ".inst 0x4f80e0b0 // sdot v16.4s, v5.16b, v0.4b[0]\n"
+ "ldr q6, [x11, #0x10]\n"
+ "ldr q7, [x11, #0x20]\n"
+ ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n"
+ "ldr q8, [x11, #0x30]\n"
+ ".inst 0x4f80e0f2 // sdot v18.4s, v7.16b, v0.4b[0]\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x4f80e113 // sdot v19.4s, v8.16b, v0.4b[0]\n"
+ "19:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x19\n"
+ "bne 5b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "tbnz %x[flags], #31, 20f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "neg v1.4s, v1.4s\n"
+ "mul v11.4s, v11.4s, v1.4s\n"
+ "20:" // Height 1: skip row sum fixup
+ "add v16.4s, v16.4s, v11.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "ldr q0, [x10, #0x0]\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "ldr q1, [x10, #0x10]\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "ldr q2, [x10, #0x20]\n"
+ "ldr q3, [x10, #0x30]\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "add x10, x10, #0x40\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "tbz %x[flags], #5, 21f\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "21:" // Height 1: no shift correction
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x12, #0x10\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "bge 30f\n"
+ "tbz x12, #3, 25f\n"
+ "str d16, [x9], #0x8\n"
+ "tbz x12, #2, 23f\n"
+ "st1 { v16.s }[2], [x9], #0x4\n"
+ "tbz x12, #1, 22f\n"
+ "st1 { v16.h }[6], [x9], #0x2\n"
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[14], [x9]\n"
+ "b 29f\n"
+ "22:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[12], [x9]\n"
+ "b 29f\n"
+ "23:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x12, #1, 24f\n"
+ "st1 { v16.h }[4], [x9], #0x2\n"
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[10], [x9]\n"
+ "b 29f\n"
+ "24:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[8], [x9]\n"
+ "b 29f\n"
+ "25:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x12, #2, 27f\n"
+ "str s16, [x9], #0x4\n"
+ "tbz x12, #1, 26f\n"
+ "st1 { v16.h }[2], [x9], #0x2\n"
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[6], [x9]\n"
+ "b 29f\n"
+ "26:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[4], [x9]\n"
+ "b 29f\n"
+ "27:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x12, #1, 28f\n"
+ "str h16, [x9], #0x2\n"
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[2], [x9]\n"
+ "b 29f\n"
+ "28:" // Height 1: Partial direct writeback: partial_1_0
+ "str b16, [x9, #0x0]\n"
+ "29:" // Height 1: Partial direct writeback: Done
+ "b 31f\n"
+ "30:" // Height 1: Full writeback
+ "str q16, [x9, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "31:" // Height 1: Writeback done
+ "subs x12, x12, #0x10\n"
+ "bgt 3b\n"
+ "b 126f\n"
+ "32:" // Height 2
+ "movi v11.4s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x10, %x[col_bias]\n"
+ "movi v12.4s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v13.4s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "tbz %x[flags], #2, 33f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "ldr x25, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "add x25, x25, x19\n"
+ "b 34f\n"
+ "33:" // Height 2: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "add x25, x9, x19\n"
+ "34:" // Height 2: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "35:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "36:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 37f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x28, 38f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "b 38f\n"
+ "37:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x24, x26, x19\n"
+ "38:" // Height 2: input setup done
+ "cmp x27, #0x10\n"
+ "blt 43f\n"
+ "cmp x27, #0x20\n"
+ "blt 41f\n"
+ "39:" // Height 2: Multiply loop: Main loop head
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q4, [x11, #0x0]\n"
+ ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x10]\n"
+ ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
+ "ldr q6, [x11, #0x20]\n"
+ "ldr q7, [x11, #0x30]\n"
+ ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q8, [x11, #0x40]\n"
+ ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
+ "ldr q9, [x11, #0x50]\n"
+ ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
+ "ldr q10, [x11, #0x60]\n"
+ ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
+ "ldr q4, [x11, #0x70]\n"
+ ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x80]\n"
+ ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
+ "ldr q6, [x11, #0x90]\n"
+ ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
+ "ldr q7, [x11, #0xa0]\n"
+ ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
+ "ldr q8, [x11, #0xb0]\n"
+ ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
+ "ldr q9, [x11, #0xc0]\n"
+ ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
+ "ldr q10, [x11, #0xd0]\n"
+ ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
+ "ldr q4, [x11, #0xe0]\n"
+ ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
+ "ldr q5, [x11, #0xf0]\n"
+ ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
+ "tbnz %x[flags], #31, 40f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ "40:" // Height 2: Multiply loop: unique 5: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x27, #0x20\n"
+ "bge 39b\n"
+ "41:" // Height 2: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q6, [x11, #0x0]\n"
+ ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x10]\n"
+ ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n"
+ "ldr q8, [x11, #0x20]\n"
+ "ldr q9, [x11, #0x30]\n"
+ ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
+ "ldr q10, [x11, #0x40]\n"
+ ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n"
+ "ldr q4, [x11, #0x50]\n"
+ ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x60]\n"
+ ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
+ "ldr q6, [x11, #0x70]\n"
+ ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x80]\n"
+ ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [x11, #0x90]\n"
+ ".inst 0x4fa0e150 // sdot v16.4s, v10.16b, v0.4b[1]\n"
+ "ldr q9, [x11, #0xa0]\n"
+ ".inst 0x4fa1e154 // sdot v20.4s, v10.16b, v1.4b[1]\n"
+ "ldr q10, [x11, #0xb0]\n"
+ ".inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4fa1e095 // sdot v21.4s, v4.16b, v1.4b[1]\n"
+ "ldr q4, [x11, #0xc0]\n"
+ ".inst 0x4fa0e0b2 // sdot v18.4s, v5.16b, v0.4b[1]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4fa1e0b6 // sdot v22.4s, v5.16b, v1.4b[1]\n"
+ "ldr q5, [x11, #0xd0]\n"
+ ".inst 0x4fa0e0d3 // sdot v19.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0d7 // sdot v23.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x11, #0xe0]\n"
+ ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8f4 // sdot v20.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x11, #0xf0]\n"
+ ".inst 0x4f80e911 // sdot v17.4s, v8.16b, v0.4b[2]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x4f81e915 // sdot v21.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x4f80e932 // sdot v18.4s, v9.16b, v0.4b[2]\n"
+ ".inst 0x4f81e936 // sdot v22.4s, v9.16b, v1.4b[2]\n"
+ ".inst 0x4f80e953 // sdot v19.4s, v10.16b, v0.4b[2]\n"
+ ".inst 0x4f81e957 // sdot v23.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x4fa0e890 // sdot v16.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e894 // sdot v20.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8b5 // sdot v21.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8d6 // sdot v22.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8f7 // sdot v23.4s, v7.16b, v1.4b[3]\n"
+ "tbnz %x[flags], #31, 42f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ "42:" // Height 2: Multiply loop: unique 6: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "43:" // Height 2: Multiply loop: Main loop skip
+ "cbz x27, 50f\n"
+ "cmp x27, #0x4\n"
+ "blt 46f\n"
+ "44:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x26], #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "tbnz %x[flags], #31, 45f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ "45:" // Height 2: Multiply loop: unique 7: skip row sum
+ "ldr q8, [x11, #0x0]\n"
+ ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q9, [x11, #0x10]\n"
+ ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q10, [x11, #0x20]\n"
+ "ldr q4, [x11, #0x30]\n"
+ ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "sub x27, x27, #0x4\n"
+ ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "cmp x27, #0x4\n"
+ ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n"
+ "bge 44b\n"
+ "cbz x27, 50f\n"
+ "46:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x27, #1, 47f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "tbz x27, #0, 48f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "b 48f\n"
+ "47:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "48:" // Height 2: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 49f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ "49:" // Height 2: Multiply loop: unique 8: skip row sum
+ "ldr q5, [x11, #0x0]\n"
+ ".inst 0x4f80e0b0 // sdot v16.4s, v5.16b, v0.4b[0]\n"
+ "ldr q6, [x11, #0x10]\n"
+ ".inst 0x4f81e0b4 // sdot v20.4s, v5.16b, v1.4b[0]\n"
+ "ldr q7, [x11, #0x20]\n"
+ "ldr q8, [x11, #0x30]\n"
+ ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0f2 // sdot v18.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0f6 // sdot v22.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f80e113 // sdot v19.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x4f81e117 // sdot v23.4s, v8.16b, v1.4b[0]\n"
+ "50:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x19\n"
+ "bne 36b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbnz %x[flags], #31, 51f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1r { v2.4s }, [x19]\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "neg v2.4s, v2.4s\n"
+ "mul v11.4s, v11.4s, v2.4s\n"
+ "mul v12.4s, v12.4s, v2.4s\n"
+ "51:" // Height 2: skip row sum fixup
+ "add v16.4s, v16.4s, v11.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "ldr q0, [x10, #0x0]\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "ldr q1, [x10, #0x10]\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "ldr q2, [x10, #0x20]\n"
+ "add v20.4s, v20.4s, v12.4s\n"
+ "ldr q3, [x10, #0x30]\n"
+ "add v21.4s, v21.4s, v12.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add v22.4s, v22.4s, v12.4s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "add x10, x10, #0x40\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "tbz %x[flags], #5, 52f\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "and v8.16b, v20.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v9.16b, v21.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v10.16b, v22.16b, v0.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "52:" // Height 2: no shift correction
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x12, #0x10\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "bge 61f\n"
+ "tbz x12, #3, 56f\n"
+ "str d16, [x9], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "tbz x12, #2, 54f\n"
+ "st1 { v16.s }[2], [x9], #0x4\n"
+ "st1 { v20.s }[2], [x25], #0x4\n"
+ "tbz x12, #1, 53f\n"
+ "st1 { v16.h }[6], [x9], #0x2\n"
+ "st1 { v20.h }[6], [x25], #0x2\n"
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[14], [x9]\n"
+ "st1 { v20.b }[14], [x25]\n"
+ "b 60f\n"
+ "53:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[12], [x9]\n"
+ "st1 { v20.b }[12], [x25]\n"
+ "b 60f\n"
+ "54:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x12, #1, 55f\n"
+ "st1 { v16.h }[4], [x9], #0x2\n"
+ "st1 { v20.h }[4], [x25], #0x2\n"
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[10], [x9]\n"
+ "st1 { v20.b }[10], [x25]\n"
+ "b 60f\n"
+ "55:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[8], [x9]\n"
+ "st1 { v20.b }[8], [x25]\n"
+ "b 60f\n"
+ "56:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x12, #2, 58f\n"
+ "str s16, [x9], #0x4\n"
+ "str s20, [x25], #0x4\n"
+ "tbz x12, #1, 57f\n"
+ "st1 { v16.h }[2], [x9], #0x2\n"
+ "st1 { v20.h }[2], [x25], #0x2\n"
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[6], [x9]\n"
+ "st1 { v20.b }[6], [x25]\n"
+ "b 60f\n"
+ "57:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[4], [x9]\n"
+ "st1 { v20.b }[4], [x25]\n"
+ "b 60f\n"
+ "58:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x12, #1, 59f\n"
+ "str h16, [x9], #0x2\n"
+ "str h20, [x25], #0x2\n"
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[2], [x9]\n"
+ "st1 { v20.b }[2], [x25]\n"
+ "b 60f\n"
+ "59:" // Height 2: Partial direct writeback: partial_1_0
+ "str b16, [x9, #0x0]\n"
+ "str b20, [x25, #0x0]\n"
+ "60:" // Height 2: Partial direct writeback: Done
+ "b 62f\n"
+ "61:" // Height 2: Full writeback
+ "str q16, [x9, #0x0]\n"
+ "str q20, [x25, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "62:" // Height 2: Writeback done
+ "subs x12, x12, #0x10\n"
+ "bgt 34b\n"
+ "b 126f\n"
+ "63:" // Height 3
+ "movi v11.4s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x10, %x[col_bias]\n"
+ "movi v12.4s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v13.4s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "tbz %x[flags], #2, 64f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "ldr x25, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "ldr x23, [%x[output_ptr], #0x10]\n"
+ "add x25, x25, x19\n"
+ "add x23, x23, x19\n"
+ "b 65f\n"
+ "64:" // Height 3: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "add x25, x9, x19\n"
+ "add x23, x25, x19\n"
+ "65:" // Height 3: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "66:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "67:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 68f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "cbnz x28, 69f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "b 69f\n"
+ "68:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "69:" // Height 3: input setup done
+ "cmp x27, #0x10\n"
+ "blt 74f\n"
+ "cmp x27, #0x20\n"
+ "blt 72f\n"
+ "70:" // Height 3: Multiply loop: Main loop head
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q4, [x11, #0x0]\n"
+ ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x10]\n"
+ ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
+ "ldr q6, [x11, #0x20]\n"
+ ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
+ "ldr q7, [x11, #0x30]\n"
+ "ldr q8, [x11, #0x40]\n"
+ ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q9, [x11, #0x50]\n"
+ ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
+ "ldr q10, [x11, #0x60]\n"
+ ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
+ "ldr q4, [x11, #0x70]\n"
+ ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x80]\n"
+ ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x11, #0x90]\n"
+ ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x11, #0xa0]\n"
+ ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
+ "ldr q8, [x11, #0xb0]\n"
+ ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
+ "ldr q9, [x11, #0xc0]\n"
+ ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
+ "ldr q10, [x11, #0xd0]\n"
+ ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
+ "ldr q4, [x11, #0xe0]\n"
+ ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
+ "ldr q5, [x11, #0xf0]\n"
+ ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n"
+ "tbnz %x[flags], #31, 71f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ "71:" // Height 3: Multiply loop: unique 9: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x27, #0x20\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "bge 70b\n"
+ "72:" // Height 3: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q6, [x11, #0x0]\n"
+ ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x10]\n"
+ ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n"
+ "ldr q8, [x11, #0x20]\n"
+ ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n"
+ "ldr q9, [x11, #0x30]\n"
+ "ldr q10, [x11, #0x40]\n"
+ ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
+ "ldr q4, [x11, #0x50]\n"
+ ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n"
+ "ldr q5, [x11, #0x60]\n"
+ ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n"
+ "ldr q6, [x11, #0x70]\n"
+ ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x80]\n"
+ ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n"
+ "ldr q8, [x11, #0x90]\n"
+ ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [x11, #0xa0]\n"
+ ".inst 0x4fa0e150 // sdot v16.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e154 // sdot v20.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e158 // sdot v24.4s, v10.16b, v2.4b[1]\n"
+ "ldr q10, [x11, #0xb0]\n"
+ ".inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e095 // sdot v21.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e099 // sdot v25.4s, v4.16b, v2.4b[1]\n"
+ "ldr q4, [x11, #0xc0]\n"
+ ".inst 0x4fa0e0b2 // sdot v18.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0b6 // sdot v22.4s, v5.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0ba // sdot v26.4s, v5.16b, v2.4b[1]\n"
+ "ldr q5, [x11, #0xd0]\n"
+ ".inst 0x4fa0e0d3 // sdot v19.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0d7 // sdot v23.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0db // sdot v27.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x11, #0xe0]\n"
+ ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8f4 // sdot v20.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f8 // sdot v24.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x11, #0xf0]\n"
+ ".inst 0x4f80e911 // sdot v17.4s, v8.16b, v0.4b[2]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x4f81e915 // sdot v21.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x4f82e919 // sdot v25.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x4f80e932 // sdot v18.4s, v9.16b, v0.4b[2]\n"
+ ".inst 0x4f81e936 // sdot v22.4s, v9.16b, v1.4b[2]\n"
+ ".inst 0x4f82e93a // sdot v26.4s, v9.16b, v2.4b[2]\n"
+ ".inst 0x4f80e953 // sdot v19.4s, v10.16b, v0.4b[2]\n"
+ ".inst 0x4f81e957 // sdot v23.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x4f82e95b // sdot v27.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x4fa0e890 // sdot v16.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e894 // sdot v20.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e898 // sdot v24.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8b5 // sdot v21.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8b9 // sdot v25.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8d6 // sdot v22.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8da // sdot v26.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8f7 // sdot v23.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n"
+ "tbnz %x[flags], #31, 73f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ "73:" // Height 3: Multiply loop: unique 10: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "74:" // Height 3: Multiply loop: Main loop skip
+ "cbz x27, 81f\n"
+ "cmp x27, #0x4\n"
+ "blt 77f\n"
+ "75:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x26], #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "ldr s2, [x22], #0x4\n"
+ "tbnz %x[flags], #31, 76f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ "76:" // Height 3: Multiply loop: unique 11: skip row sum
+ "ldr q8, [x11, #0x0]\n"
+ ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q9, [x11, #0x10]\n"
+ ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q10, [x11, #0x20]\n"
+ ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q4, [x11, #0x30]\n"
+ "sub x27, x27, #0x4\n"
+ ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "cmp x27, #0x4\n"
+ ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n"
+ "bge 75b\n"
+ "cbz x27, 81f\n"
+ "77:" // Height 3: Multiply loop: Skip odd blocks
+ "tbz x27, #1, 78f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x22], #0x2\n"
+ "tbz x27, #0, 79f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x22]\n"
+ "b 79f\n"
+ "78:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x22, #0x0]\n"
+ "79:" // Height 3: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 80f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ "80:" // Height 3: Multiply loop: unique 12: skip row sum
+ "ldr q5, [x11, #0x0]\n"
+ ".inst 0x4f80e0b0 // sdot v16.4s, v5.16b, v0.4b[0]\n"
+ "ldr q6, [x11, #0x10]\n"
+ ".inst 0x4f81e0b4 // sdot v20.4s, v5.16b, v1.4b[0]\n"
+ "ldr q7, [x11, #0x20]\n"
+ ".inst 0x4f82e0b8 // sdot v24.4s, v5.16b, v2.4b[0]\n"
+ "ldr q8, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0f2 // sdot v18.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0f6 // sdot v22.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0fa // sdot v26.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f80e113 // sdot v19.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x4f81e117 // sdot v23.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x4f82e11b // sdot v27.4s, v8.16b, v2.4b[0]\n"
+ "81:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x19\n"
+ "bne 67b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbnz %x[flags], #31, 82f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1r { v3.4s }, [x19]\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "neg v3.4s, v3.4s\n"
+ "mul v11.4s, v11.4s, v3.4s\n"
+ "mul v12.4s, v12.4s, v3.4s\n"
+ "mul v13.4s, v13.4s, v3.4s\n"
+ "82:" // Height 3: skip row sum fixup
+ "add v16.4s, v16.4s, v11.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "ldr q0, [x10, #0x0]\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "ldr q1, [x10, #0x10]\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "ldr q2, [x10, #0x20]\n"
+ "add v20.4s, v20.4s, v12.4s\n"
+ "ldr q3, [x10, #0x30]\n"
+ "add v21.4s, v21.4s, v12.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add v22.4s, v22.4s, v12.4s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "add x10, x10, #0x40\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v25.4s, v25.4s, v1.4s\n"
+ "add v26.4s, v26.4s, v2.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v4.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v4.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "tbz %x[flags], #5, 83f\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "and v8.16b, v20.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v9.16b, v21.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v10.16b, v22.16b, v0.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v24.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "and v6.16b, v25.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v5.4s\n"
+ "and v7.16b, v26.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v8.16b, v27.16b, v0.16b\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v7.4s\n"
+ "sqadd v27.4s, v27.4s, v8.4s\n"
+ "83:" // Height 3: no shift correction
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x12, #0x10\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "srshl v25.4s, v25.4s, v0.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "srshl v26.4s, v26.4s, v0.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "srshl v27.4s, v27.4s, v0.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "smin v26.4s, v26.4s, v6.4s\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "smax v26.4s, v26.4s, v5.4s\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "bge 92f\n"
+ "tbz x12, #3, 87f\n"
+ "str d16, [x9], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "tbz x12, #2, 85f\n"
+ "st1 { v16.s }[2], [x9], #0x4\n"
+ "st1 { v20.s }[2], [x25], #0x4\n"
+ "st1 { v24.s }[2], [x23], #0x4\n"
+ "tbz x12, #1, 84f\n"
+ "st1 { v16.h }[6], [x9], #0x2\n"
+ "st1 { v20.h }[6], [x25], #0x2\n"
+ "st1 { v24.h }[6], [x23], #0x2\n"
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[14], [x9]\n"
+ "st1 { v20.b }[14], [x25]\n"
+ "st1 { v24.b }[14], [x23]\n"
+ "b 91f\n"
+ "84:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[12], [x9]\n"
+ "st1 { v20.b }[12], [x25]\n"
+ "st1 { v24.b }[12], [x23]\n"
+ "b 91f\n"
+ "85:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x12, #1, 86f\n"
+ "st1 { v16.h }[4], [x9], #0x2\n"
+ "st1 { v20.h }[4], [x25], #0x2\n"
+ "st1 { v24.h }[4], [x23], #0x2\n"
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[10], [x9]\n"
+ "st1 { v20.b }[10], [x25]\n"
+ "st1 { v24.b }[10], [x23]\n"
+ "b 91f\n"
+ "86:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[8], [x9]\n"
+ "st1 { v20.b }[8], [x25]\n"
+ "st1 { v24.b }[8], [x23]\n"
+ "b 91f\n"
+ "87:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x12, #2, 89f\n"
+ "str s16, [x9], #0x4\n"
+ "str s20, [x25], #0x4\n"
+ "str s24, [x23], #0x4\n"
+ "tbz x12, #1, 88f\n"
+ "st1 { v16.h }[2], [x9], #0x2\n"
+ "st1 { v20.h }[2], [x25], #0x2\n"
+ "st1 { v24.h }[2], [x23], #0x2\n"
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[6], [x9]\n"
+ "st1 { v20.b }[6], [x25]\n"
+ "st1 { v24.b }[6], [x23]\n"
+ "b 91f\n"
+ "88:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[4], [x9]\n"
+ "st1 { v20.b }[4], [x25]\n"
+ "st1 { v24.b }[4], [x23]\n"
+ "b 91f\n"
+ "89:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x12, #1, 90f\n"
+ "str h16, [x9], #0x2\n"
+ "str h20, [x25], #0x2\n"
+ "str h24, [x23], #0x2\n"
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[2], [x9]\n"
+ "st1 { v20.b }[2], [x25]\n"
+ "st1 { v24.b }[2], [x23]\n"
+ "b 91f\n"
+ "90:" // Height 3: Partial direct writeback: partial_1_0
+ "str b16, [x9, #0x0]\n"
+ "str b20, [x25, #0x0]\n"
+ "str b24, [x23, #0x0]\n"
+ "91:" // Height 3: Partial direct writeback: Done
+ "b 93f\n"
+ "92:" // Height 3: Full writeback
+ "str q16, [x9, #0x0]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q24, [x23, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "93:" // Height 3: Writeback done
+ "subs x12, x12, #0x10\n"
+ "bgt 65b\n"
+ "b 126f\n"
+ "94:" // Height 4
+ "movi v11.4s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x10, %x[col_bias]\n"
+ "movi v12.4s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v13.4s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "tbz %x[flags], #2, 95f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "ldr x25, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "ldr x23, [%x[output_ptr], #0x10]\n"
+ "ldr x21, [%x[output_ptr], #0x18]\n"
+ "add x25, x25, x19\n"
+ "add %x[output_ptr], %x[output_ptr], #0x20\n"
+ "add x23, x23, x19\n"
+ "add x21, x21, x19\n"
+ "b 96f\n"
+ "95:" // Height 4: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "add x25, x9, x19\n"
+ "add x23, x25, x19\n"
+ "add x21, x23, x19\n"
+ "add %x[output_ptr], x21, x19\n"
+ "96:" // Height 4: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "97:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "98:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 99f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x20, [x20, #0x18]\n"
+ "cbnz x28, 100f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "add x20, x20, x19\n"
+ "b 100f\n"
+ "99:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "add x20, x22, x19\n"
+ "100:" // Height 4: input setup done
+ "cmp x27, #0x10\n"
+ "blt 105f\n"
+ "cmp x27, #0x20\n"
+ "blt 103f\n"
+ "101:" // Height 4: Multiply loop: Main loop head
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "ldr q4, [x11, #0x0]\n"
+ ".inst 0x4f80e090 // sdot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x10]\n"
+ ".inst 0x4f81e094 // sdot v20.4s, v4.16b, v1.4b[0]\n"
+ "ldr q6, [x11, #0x20]\n"
+ ".inst 0x4f82e098 // sdot v24.4s, v4.16b, v2.4b[0]\n"
+ "ldr q7, [x11, #0x30]\n"
+ ".inst 0x4f83e09c // sdot v28.4s, v4.16b, v3.4b[0]\n"
+ "ldr q8, [x11, #0x40]\n"
+ "ldr q9, [x11, #0x50]\n"
+ ".inst 0x4f80e0b1 // sdot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q10, [x11, #0x60]\n"
+ ".inst 0x4f81e0b5 // sdot v21.4s, v5.16b, v1.4b[0]\n"
+ "ldr q4, [x11, #0x70]\n"
+ ".inst 0x4f82e0b9 // sdot v25.4s, v5.16b, v2.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f83e0bd // sdot v29.4s, v5.16b, v3.4b[0]\n"
+ "ldr q5, [x11, #0x80]\n"
+ ".inst 0x4f80e0d2 // sdot v18.4s, v6.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f81e0d6 // sdot v22.4s, v6.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f82e0da // sdot v26.4s, v6.16b, v2.4b[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x4f83e0de // sdot v30.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x11, #0x90]\n"
+ ".inst 0x4f80e0f3 // sdot v19.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0f7 // sdot v23.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0fb // sdot v27.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0ff // sdot v31.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x11, #0xa0]\n"
+ ".inst 0x4fa0e110 // sdot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e114 // sdot v20.4s, v8.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e118 // sdot v24.4s, v8.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e11c // sdot v28.4s, v8.16b, v3.4b[1]\n"
+ "ldr q8, [x11, #0xb0]\n"
+ ".inst 0x4fa0e131 // sdot v17.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e135 // sdot v21.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e139 // sdot v25.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e13d // sdot v29.4s, v9.16b, v3.4b[1]\n"
+ "ldr q9, [x11, #0xc0]\n"
+ ".inst 0x4fa0e152 // sdot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e156 // sdot v22.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e15a // sdot v26.4s, v10.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e15e // sdot v30.4s, v10.16b, v3.4b[1]\n"
+ "ldr q10, [x11, #0xd0]\n"
+ ".inst 0x4fa0e093 // sdot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e097 // sdot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e09b // sdot v27.4s, v4.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e09f // sdot v31.4s, v4.16b, v3.4b[1]\n"
+ "ldr q4, [x11, #0xe0]\n"
+ ".inst 0x4f80e8b0 // sdot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8b4 // sdot v20.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8b8 // sdot v24.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8bc // sdot v28.4s, v5.16b, v3.4b[2]\n"
+ "ldr q5, [x11, #0xf0]\n"
+ ".inst 0x4f80e8d1 // sdot v17.4s, v6.16b, v0.4b[2]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x4f81e8d5 // sdot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d9 // sdot v25.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8dd // sdot v29.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f80e8f2 // sdot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8f6 // sdot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8fa // sdot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8fe // sdot v30.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f80e913 // sdot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x4f81e917 // sdot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x4f82e91b // sdot v27.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x4f83e91f // sdot v31.4s, v8.16b, v3.4b[2]\n"
+ ".inst 0x4fa0e930 // sdot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e934 // sdot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e938 // sdot v24.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e93c // sdot v28.4s, v9.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e951 // sdot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e955 // sdot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e959 // sdot v25.4s, v10.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e95d // sdot v29.4s, v10.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e892 // sdot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e896 // sdot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e89a // sdot v26.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e89e // sdot v30.4s, v4.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e8b3 // sdot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8b7 // sdot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8bb // sdot v27.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8bf // sdot v31.4s, v5.16b, v3.4b[3]\n"
+ "tbnz %x[flags], #31, 102f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
+ "102:" // Height 4: Multiply loop: unique 13: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x27, #0x20\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "bge 101b\n"
+ "103:" // Height 4: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "ldr q6, [x11, #0x0]\n"
+ ".inst 0x4f80e0d0 // sdot v16.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x10]\n"
+ ".inst 0x4f81e0d4 // sdot v20.4s, v6.16b, v1.4b[0]\n"
+ "ldr q8, [x11, #0x20]\n"
+ ".inst 0x4f82e0d8 // sdot v24.4s, v6.16b, v2.4b[0]\n"
+ "ldr q9, [x11, #0x30]\n"
+ ".inst 0x4f83e0dc // sdot v28.4s, v6.16b, v3.4b[0]\n"
+ "ldr q10, [x11, #0x40]\n"
+ "ldr q4, [x11, #0x50]\n"
+ ".inst 0x4f80e0f1 // sdot v17.4s, v7.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x60]\n"
+ ".inst 0x4f81e0f5 // sdot v21.4s, v7.16b, v1.4b[0]\n"
+ "ldr q6, [x11, #0x70]\n"
+ ".inst 0x4f82e0f9 // sdot v25.4s, v7.16b, v2.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f83e0fd // sdot v29.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x11, #0x80]\n"
+ ".inst 0x4f80e112 // sdot v18.4s, v8.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f81e116 // sdot v22.4s, v8.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f82e11a // sdot v26.4s, v8.16b, v2.4b[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x4f83e11e // sdot v30.4s, v8.16b, v3.4b[0]\n"
+ "ldr q8, [x11, #0x90]\n"
+ ".inst 0x4f80e133 // sdot v19.4s, v9.16b, v0.4b[0]\n"
+ ".inst 0x4f81e137 // sdot v23.4s, v9.16b, v1.4b[0]\n"
+ ".inst 0x4f82e13b // sdot v27.4s, v9.16b, v2.4b[0]\n"
+ ".inst 0x4f83e13f // sdot v31.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [x11, #0xa0]\n"
+ ".inst 0x4fa0e150 // sdot v16.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e154 // sdot v20.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e158 // sdot v24.4s, v10.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e15c // sdot v28.4s, v10.16b, v3.4b[1]\n"
+ "ldr q10, [x11, #0xb0]\n"
+ ".inst 0x4fa0e091 // sdot v17.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e095 // sdot v21.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e099 // sdot v25.4s, v4.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e09d // sdot v29.4s, v4.16b, v3.4b[1]\n"
+ "ldr q4, [x11, #0xc0]\n"
+ ".inst 0x4fa0e0b2 // sdot v18.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0b6 // sdot v22.4s, v5.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0ba // sdot v26.4s, v5.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0be // sdot v30.4s, v5.16b, v3.4b[1]\n"
+ "ldr q5, [x11, #0xd0]\n"
+ ".inst 0x4fa0e0d3 // sdot v19.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0d7 // sdot v23.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0db // sdot v27.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0df // sdot v31.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x11, #0xe0]\n"
+ ".inst 0x4f80e8f0 // sdot v16.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8f4 // sdot v20.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f8 // sdot v24.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8fc // sdot v28.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x11, #0xf0]\n"
+ ".inst 0x4f80e911 // sdot v17.4s, v8.16b, v0.4b[2]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x4f81e915 // sdot v21.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x4f82e919 // sdot v25.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x4f83e91d // sdot v29.4s, v8.16b, v3.4b[2]\n"
+ ".inst 0x4f80e932 // sdot v18.4s, v9.16b, v0.4b[2]\n"
+ ".inst 0x4f81e936 // sdot v22.4s, v9.16b, v1.4b[2]\n"
+ ".inst 0x4f82e93a // sdot v26.4s, v9.16b, v2.4b[2]\n"
+ ".inst 0x4f83e93e // sdot v30.4s, v9.16b, v3.4b[2]\n"
+ ".inst 0x4f80e953 // sdot v19.4s, v10.16b, v0.4b[2]\n"
+ ".inst 0x4f81e957 // sdot v23.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x4f82e95b // sdot v27.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x4f83e95f // sdot v31.4s, v10.16b, v3.4b[2]\n"
+ ".inst 0x4fa0e890 // sdot v16.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e894 // sdot v20.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e898 // sdot v24.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e89c // sdot v28.4s, v4.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e8b1 // sdot v17.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8b5 // sdot v21.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8b9 // sdot v25.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8bd // sdot v29.4s, v5.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e8d2 // sdot v18.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8d6 // sdot v22.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8da // sdot v26.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8de // sdot v30.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e8f3 // sdot v19.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8f7 // sdot v23.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8fb // sdot v27.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8ff // sdot v31.4s, v7.16b, v3.4b[3]\n"
+ "tbnz %x[flags], #31, 104f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
+ "104:" // Height 4: Multiply loop: unique 14: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "105:" // Height 4: Multiply loop: Main loop skip
+ "cbz x27, 112f\n"
+ "cmp x27, #0x4\n"
+ "blt 108f\n"
+ "106:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x26], #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "ldr s2, [x22], #0x4\n"
+ "ldr s3, [x20], #0x4\n"
+ "tbnz %x[flags], #31, 107f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
+ "107:" // Height 4: Multiply loop: unique 15: skip row sum
+ "ldr q8, [x11, #0x0]\n"
+ ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q9, [x11, #0x10]\n"
+ ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q10, [x11, #0x20]\n"
+ ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q4, [x11, #0x30]\n"
+ ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
+ "sub x27, x27, #0x4\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
+ "cmp x27, #0x4\n"
+ ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
+ ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
+ ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
+ ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
+ ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
+ ".inst 0x4f80e093 // sdot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x4f81e097 // sdot v23.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x4f82e09b // sdot v27.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x4f83e09f // sdot v31.4s, v4.16b, v3.4b[0]\n"
+ "bge 106b\n"
+ "cbz x27, 112f\n"
+ "108:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x27, #1, 109f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x22], #0x2\n"
+ "ldr h3, [x20], #0x2\n"
+ "tbz x27, #0, 110f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x22]\n"
+ "ld1 { v3.b }[2], [x20]\n"
+ "b 110f\n"
+ "109:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x22, #0x0]\n"
+ "ldr b3, [x20, #0x0]\n"
+ "110:" // Height 4: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 111f\n"
+ ".inst 0x4e8f940b // sdot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x4e8f942c // sdot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x4e8f944d // sdot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x4e8f946e // sdot v14.4s, v3.16b, v15.16b\n"
+ "111:" // Height 4: Multiply loop: unique 16: skip row sum
+ "ldr q5, [x11, #0x0]\n"
+ ".inst 0x4f80e0b0 // sdot v16.4s, v5.16b, v0.4b[0]\n"
+ "ldr q6, [x11, #0x10]\n"
+ ".inst 0x4f81e0b4 // sdot v20.4s, v5.16b, v1.4b[0]\n"
+ "ldr q7, [x11, #0x20]\n"
+ ".inst 0x4f82e0b8 // sdot v24.4s, v5.16b, v2.4b[0]\n"
+ "ldr q8, [x11, #0x30]\n"
+ ".inst 0x4f83e0bc // sdot v28.4s, v5.16b, v3.4b[0]\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x4f80e0d1 // sdot v17.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0d5 // sdot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d9 // sdot v25.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0dd // sdot v29.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0f2 // sdot v18.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0f6 // sdot v22.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0fa // sdot v26.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0fe // sdot v30.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f80e113 // sdot v19.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x4f81e117 // sdot v23.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x4f82e11b // sdot v27.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x4f83e11f // sdot v31.4s, v8.16b, v3.4b[0]\n"
+ "112:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x19\n"
+ "bne 98b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbnz %x[flags], #31, 113f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "addp v14.4s, v14.4s, v14.4s\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "addp v14.4s, v14.4s, v14.4s\n"
+ "neg v4.4s, v4.4s\n"
+ "mul v11.4s, v11.4s, v4.4s\n"
+ "mul v12.4s, v12.4s, v4.4s\n"
+ "mul v13.4s, v13.4s, v4.4s\n"
+ "mul v14.4s, v14.4s, v4.4s\n"
+ "113:" // Height 4: skip row sum fixup
+ "add v16.4s, v16.4s, v11.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "ldr q0, [x10, #0x0]\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "ldr q1, [x10, #0x10]\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "ldr q2, [x10, #0x20]\n"
+ "add v20.4s, v20.4s, v12.4s\n"
+ "ldr q3, [x10, #0x30]\n"
+ "add v21.4s, v21.4s, v12.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add v22.4s, v22.4s, v12.4s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "add x10, x10, #0x40\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "add v25.4s, v25.4s, v1.4s\n"
+ "add v26.4s, v26.4s, v2.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "add v28.4s, v28.4s, v0.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v29.4s, v29.4s, v1.4s\n"
+ "add v30.4s, v30.4s, v2.4s\n"
+ "add v31.4s, v31.4s, v3.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v4.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v4.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v4.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v4.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v4.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "tbz %x[flags], #5, 114f\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "and v8.16b, v20.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v9.16b, v21.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v10.16b, v22.16b, v0.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v24.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "and v6.16b, v25.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v5.4s\n"
+ "and v7.16b, v26.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v8.16b, v27.16b, v0.16b\n"
+ "and v9.16b, v28.16b, v0.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "and v10.16b, v29.16b, v0.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v4.16b, v30.16b, v0.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v7.4s\n"
+ "and v5.16b, v31.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v27.4s, v27.4s, v8.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v9.4s\n"
+ "sqadd v29.4s, v29.4s, v10.4s\n"
+ "sqadd v30.4s, v30.4s, v4.4s\n"
+ "sqadd v31.4s, v31.4s, v5.4s\n"
+ "114:" // Height 4: no shift correction
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x12, #0x10\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "srshl v25.4s, v25.4s, v0.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "srshl v26.4s, v26.4s, v0.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "srshl v27.4s, v27.4s, v0.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "srshl v28.4s, v28.4s, v0.4s\n"
+ "smin v26.4s, v26.4s, v6.4s\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
+ "srshl v29.4s, v29.4s, v0.4s\n"
+ "smax v26.4s, v26.4s, v5.4s\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "srshl v30.4s, v30.4s, v0.4s\n"
+ "smin v28.4s, v28.4s, v6.4s\n"
+ "smin v29.4s, v29.4s, v6.4s\n"
+ "srshl v31.4s, v31.4s, v0.4s\n"
+ "smax v28.4s, v28.4s, v5.4s\n"
+ "smax v29.4s, v29.4s, v5.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "smin v30.4s, v30.4s, v6.4s\n"
+ "smin v31.4s, v31.4s, v6.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "smax v30.4s, v30.4s, v5.4s\n"
+ "smax v31.4s, v31.4s, v5.4s\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v28.8h, v28.8h, v29.8h\n"
+ "uzp1 v29.8h, v30.8h, v31.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v28.16b, v28.16b, v29.16b\n"
+ "bge 123f\n"
+ "tbz x12, #3, 118f\n"
+ "str d16, [x9], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x12, #2, 116f\n"
+ "st1 { v16.s }[2], [x9], #0x4\n"
+ "st1 { v20.s }[2], [x25], #0x4\n"
+ "st1 { v24.s }[2], [x23], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
+ "tbz x12, #1, 115f\n"
+ "st1 { v16.h }[6], [x9], #0x2\n"
+ "st1 { v20.h }[6], [x25], #0x2\n"
+ "st1 { v24.h }[6], [x23], #0x2\n"
+ "st1 { v28.h }[6], [x21], #0x2\n"
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[14], [x9]\n"
+ "st1 { v20.b }[14], [x25]\n"
+ "st1 { v24.b }[14], [x23]\n"
+ "st1 { v28.b }[14], [x21]\n"
+ "b 122f\n"
+ "115:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[12], [x9]\n"
+ "st1 { v20.b }[12], [x25]\n"
+ "st1 { v24.b }[12], [x23]\n"
+ "st1 { v28.b }[12], [x21]\n"
+ "b 122f\n"
+ "116:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x12, #1, 117f\n"
+ "st1 { v16.h }[4], [x9], #0x2\n"
+ "st1 { v20.h }[4], [x25], #0x2\n"
+ "st1 { v24.h }[4], [x23], #0x2\n"
+ "st1 { v28.h }[4], [x21], #0x2\n"
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[10], [x9]\n"
+ "st1 { v20.b }[10], [x25]\n"
+ "st1 { v24.b }[10], [x23]\n"
+ "st1 { v28.b }[10], [x21]\n"
+ "b 122f\n"
+ "117:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[8], [x9]\n"
+ "st1 { v20.b }[8], [x25]\n"
+ "st1 { v24.b }[8], [x23]\n"
+ "st1 { v28.b }[8], [x21]\n"
+ "b 122f\n"
+ "118:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x12, #2, 120f\n"
+ "str s16, [x9], #0x4\n"
+ "str s20, [x25], #0x4\n"
+ "str s24, [x23], #0x4\n"
+ "str s28, [x21], #0x4\n"
+ "tbz x12, #1, 119f\n"
+ "st1 { v16.h }[2], [x9], #0x2\n"
+ "st1 { v20.h }[2], [x25], #0x2\n"
+ "st1 { v24.h }[2], [x23], #0x2\n"
+ "st1 { v28.h }[2], [x21], #0x2\n"
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[6], [x9]\n"
+ "st1 { v20.b }[6], [x25]\n"
+ "st1 { v24.b }[6], [x23]\n"
+ "st1 { v28.b }[6], [x21]\n"
+ "b 122f\n"
+ "119:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[4], [x9]\n"
+ "st1 { v20.b }[4], [x25]\n"
+ "st1 { v24.b }[4], [x23]\n"
+ "st1 { v28.b }[4], [x21]\n"
+ "b 122f\n"
+ "120:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x12, #1, 121f\n"
+ "str h16, [x9], #0x2\n"
+ "str h20, [x25], #0x2\n"
+ "str h24, [x23], #0x2\n"
+ "str h28, [x21], #0x2\n"
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[2], [x9]\n"
+ "st1 { v20.b }[2], [x25]\n"
+ "st1 { v24.b }[2], [x23]\n"
+ "st1 { v28.b }[2], [x21]\n"
+ "b 122f\n"
+ "121:" // Height 4: Partial direct writeback: partial_1_0
+ "str b16, [x9, #0x0]\n"
+ "str b20, [x25, #0x0]\n"
+ "str b24, [x23, #0x0]\n"
+ "str b28, [x21, #0x0]\n"
+ "122:" // Height 4: Partial direct writeback: Done
+ "b 124f\n"
+ "123:" // Height 4: Full writeback
+ "str q16, [x9, #0x0]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q28, [x21, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "124:" // Height 4: Writeback done
+ "subs x12, x12, #0x10\n"
+ "bgt 96b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 126f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 125f\n"
+ "add x20, x20, #0x4\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "125:" // Update direct input
+ "mov x19, #0x4\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "126:" // Exit
+
+ : [M] "+r" (M), [flags] "+r" (flags), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
new file mode 100644
index 0000000000..6d4f3b2efe
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+
+#define ARGLIST /* positional kernel parameters; names follow the definition in a64_hybrid_s8qs_dot_6x16/generic.cpp */ \
+ unsigned int, const unsigned int *, /* num_strings, string_lengths */ \
+ IndirectInputArg<int8_t>, /* A_arg: direct or indirect input descriptor */ \
+ size_t, size_t, /* M, N */ \
+ const int8_t *, /* B_ptr */ \
+ IndirectOutputArg<int8_t>, /* output_arg: direct or indirect output descriptor */ \
+ const Requantize32 *, const int32_t *, unsigned int /* qp, col_bias, col_base */
+
+namespace arm_gemm
+{
+
+// Actual kernel implementations (the generic variant is defined in a64_hybrid_s8qs_dot_6x16/generic.cpp)
+void a64_hybrid_s8qs_dot_6x16( ARGLIST );
+
+// Strategy descriptor for the a64_hybrid_s8qs_dot_6x16 kernel.
+//
+// Publishes the operand/result element types, the kernel function-pointer
+// type, the blocking parameters and the default kernel entry point.
+class cls_a64_hybrid_s8qs_dot_6x16
+{
+public:
+    // Input element type (matches the int8_t A/B arguments in ARGLIST).
+    typedef int8_t operand_type;
+    // Output element type (matches the int8_t output argument in ARGLIST).
+    typedef int8_t result_type;
+
+    // Pointer to a kernel function taking the ARGLIST parameter list.
+    typedef void (*kern_type)( ARGLIST );
+
+    /* Kernel blocking parameters */
+
+    // Output rows per kernel pass (the "6" in 6x16).
+    static constexpr unsigned int out_height()
+    {
+        return 6;
+    }
+
+    // Output columns per kernel pass (the "16" in 6x16).
+    // Declared constexpr like the other blocking parameters so that all of
+    // them can be used in constant expressions; callers that evaluate it at
+    // run time are unaffected.
+    static constexpr unsigned int out_width()
+    {
+        return 16;
+    }
+
+    // Unroll factor along K; presumably the four int8 values consumed by
+    // each sdot lane -- confirm against the kernel body.
+    static constexpr unsigned int k_unroll()
+    {
+        return 4;
+    }
+
+    // This kernel reports that it does not support accumulation.
+    static constexpr bool supports_accumulate()
+    {
+        return false;
+    }
+
+    // Instantiated with the same 6 / 16 / 4 values returned by out_height(),
+    // out_width() and k_unroll() above; keep them in sync when editing.
+    StdTransformsFixed<operand_type, result_type, 6, 16, 4> transforms = {};
+
+    // Default to the generic kernel
+    kern_type kernel=a64_hybrid_s8qs_dot_6x16;
+
+    // The CPUInfo argument is ignored: the constructor body is empty, so the
+    // generic kernel selected above is always used.
+    cls_a64_hybrid_s8qs_dot_6x16(const CPUInfo *)
+    {
+    }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
new file mode 100644
index 0000000000..0e98ab8347
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8qs_dot_6x16/generic.cpp
@@ -0,0 +1,3613 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_hybrid_s8qs_dot_6x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base
+)
+{
+ struct KernelArgs {
+ const int32_t *multiplier_ptr = {};
+ const int32_t *shift_ptr = {};
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->per_channel_requant) {
+ flags |= 0x10;
+ ka.multiplier_ptr=qp->per_channel_muls + col_base;
+ ka.shift_ptr=qp->per_channel_right_shifts + col_base;
+ }
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 141f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 113f\n"
+ "beq 85f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 57f\n"
+ "beq 29f\n"
+ "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x16, %x[col_bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "4:" // Height 1: setup done
+ "mov x12, #0x0\n"
+ "5:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 6f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "cbnz x12, 7f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "b 7f\n"
+ "6:" // Height 1: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "7:" // Height 1: input setup done
+ "cmp x11, #0x10\n"
+ "blt 10f\n"
+ "cmp x11, #0x20\n"
+ "blt 9f\n"
+ "8:" // Height 1: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "bge 8b\n"
+ "9:" // Height 1: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "10:" // Height 1: Multiply loop: Main loop skip
+ "cbz x11, 15f\n"
+ "cmp x11, #0x4\n"
+ "blt 12f\n"
+ "11:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "cmp x11, #0x4\n"
+ "bge 11b\n"
+ "cbz x11, 15f\n"
+ "12:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 13f\n"
+ "ldr h0, [x10], #0x2\n"
+ "tbz x11, #0, 14f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "b 14f\n"
+ "13:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "14:" // Height 1: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "15:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 5b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "ldr q0, [x16, #0x0]\n"
+ "add v8.4s, v8.4s, v0.4s\n"
+ "ldr q1, [x16, #0x10]\n"
+ "ldr q2, [x16, #0x20]\n"
+ "add v9.4s, v9.4s, v1.4s\n"
+ "ldr q3, [x16, #0x30]\n"
+ "add v10.4s, v10.4s, v2.4s\n"
+ "add x16, x16, #0x40\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "tbz %x[flags], #4, 16f\n"
+ "ldr q0, [x17, #0x0]\n"
+ "ldr q4, [x8, #0x0]\n"
+ "ldr q1, [x17, #0x10]\n"
+ "ldr q5, [x8, #0x10]\n"
+ "ldr q2, [x17, #0x20]\n"
+ "ldr q6, [x8, #0x20]\n"
+ "ldr q3, [x17, #0x30]\n"
+ "ldr q7, [x8, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "add x8, x8, #0x40\n"
+ "b 17f\n"
+ "16:" // Height 1: per layer parameters
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "mov v1.16b, v0.16b\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "17:" // Height 1: parameters loaded
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "tbz %x[flags], #5, 18f\n"
+ "and v4.16b, v8.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v9.16b, v1.16b\n"
+ "and v6.16b, v10.16b, v2.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v4.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v9.4s, v9.4s, v5.4s\n"
+ "sqadd v10.4s, v10.4s, v6.4s\n"
+ "sqadd v11.4s, v11.4s, v7.4s\n"
+ "18:" // Height 1: no shift correction
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x15, #0x10\n"
+ "add v8.4s, v8.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v4.4s\n"
+ "add v10.4s, v10.4s, v4.4s\n"
+ "add v11.4s, v11.4s, v4.4s\n"
+ "smin v8.4s, v8.4s, v6.4s\n"
+ "smin v9.4s, v9.4s, v6.4s\n"
+ "smin v10.4s, v10.4s, v6.4s\n"
+ "smax v8.4s, v8.4s, v5.4s\n"
+ "smax v9.4s, v9.4s, v5.4s\n"
+ "smax v10.4s, v10.4s, v5.4s\n"
+ "smin v11.4s, v11.4s, v6.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "smax v11.4s, v11.4s, v5.4s\n"
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "bge 27f\n"
+ "tbz x15, #3, 22f\n"
+ "str d8, [x13], #0x8\n"
+ "tbz x15, #2, 20f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "tbz x15, #1, 19f\n"
+ "st1 { v8.h }[6], [x13], #0x2\n"
+ "tbz x15, #0, 26f\n"
+ "st1 { v8.b }[14], [x13]\n"
+ "b 26f\n"
+ "19:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 26f\n"
+ "st1 { v8.b }[12], [x13]\n"
+ "b 26f\n"
+ "20:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 21f\n"
+ "st1 { v8.h }[4], [x13], #0x2\n"
+ "tbz x15, #0, 26f\n"
+ "st1 { v8.b }[10], [x13]\n"
+ "b 26f\n"
+ "21:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 26f\n"
+ "st1 { v8.b }[8], [x13]\n"
+ "b 26f\n"
+ "22:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 24f\n"
+ "str s8, [x13], #0x4\n"
+ "tbz x15, #1, 23f\n"
+ "st1 { v8.h }[2], [x13], #0x2\n"
+ "tbz x15, #0, 26f\n"
+ "st1 { v8.b }[6], [x13]\n"
+ "b 26f\n"
+ "23:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 26f\n"
+ "st1 { v8.b }[4], [x13]\n"
+ "b 26f\n"
+ "24:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 25f\n"
+ "str h8, [x13], #0x2\n"
+ "tbz x15, #0, 26f\n"
+ "st1 { v8.b }[2], [x13]\n"
+ "b 26f\n"
+ "25:" // Height 1: Partial direct writeback: partial_1_0
+ "str b8, [x13, #0x0]\n"
+ "26:" // Height 1: Partial direct writeback: Done
+ "b 28f\n"
+ "27:" // Height 1: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "28:" // Height 1: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 3b\n"
+ "b 170f\n"
+ "29:" // Height 2
+ "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x16, %x[col_bias]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 30f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "b 31f\n"
+ "30:" // Height 2: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19\n"
+ "31:" // Height 2: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "32:" // Height 2: setup done
+ "mov x12, #0x0\n"
+ "33:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 34f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x12, 35f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "b 35f\n"
+ "34:" // Height 2: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "35:" // Height 2: input setup done
+ "cmp x11, #0x10\n"
+ "blt 38f\n"
+ "cmp x11, #0x20\n"
+ "blt 37f\n"
+ "36:" // Height 2: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "bge 36b\n"
+ "37:" // Height 2: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "38:" // Height 2: Multiply loop: Main loop skip
+ "cbz x11, 43f\n"
+ "cmp x11, #0x4\n"
+ "blt 40f\n"
+ "39:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "bge 39b\n"
+ "cbz x11, 43f\n"
+ "40:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 41f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "tbz x11, #0, 42f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "b 42f\n"
+ "41:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "42:" // Height 2: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "43:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 33b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "ldr q0, [x16, #0x0]\n"
+ "add v8.4s, v8.4s, v0.4s\n"
+ "ldr q1, [x16, #0x10]\n"
+ "add v12.4s, v12.4s, v0.4s\n"
+ "ldr q2, [x16, #0x20]\n"
+ "ldr q3, [x16, #0x30]\n"
+ "add v9.4s, v9.4s, v1.4s\n"
+ "add x16, x16, #0x40\n"
+ "add v13.4s, v13.4s, v1.4s\n"
+ "add v10.4s, v10.4s, v2.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "add v14.4s, v14.4s, v2.4s\n"
+ "add v15.4s, v15.4s, v3.4s\n"
+ "tbz %x[flags], #4, 44f\n"
+ "ldr q0, [x17, #0x0]\n"
+ "ldr q4, [x8, #0x0]\n"
+ "ldr q1, [x17, #0x10]\n"
+ "ldr q5, [x8, #0x10]\n"
+ "ldr q2, [x17, #0x20]\n"
+ "ldr q6, [x8, #0x20]\n"
+ "ldr q3, [x17, #0x30]\n"
+ "ldr q7, [x8, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "add x8, x8, #0x40\n"
+ "b 45f\n"
+ "44:" // Height 2: per layer parameters
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "mov v1.16b, v0.16b\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "45:" // Height 2: parameters loaded
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v4.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v5.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v6.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v7.4s\n"
+ "tbz %x[flags], #5, 46f\n"
+ "and v4.16b, v8.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v9.16b, v1.16b\n"
+ "and v6.16b, v10.16b, v2.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v4.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v4.16b, v12.16b, v0.16b\n"
+ "sqadd v9.4s, v9.4s, v5.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v6.4s\n"
+ "and v5.16b, v13.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v11.4s, v11.4s, v7.4s\n"
+ "and v6.16b, v14.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v4.4s\n"
+ "and v7.16b, v15.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v13.4s, v13.4s, v5.4s\n"
+ "sqadd v14.4s, v14.4s, v6.4s\n"
+ "sqadd v15.4s, v15.4s, v7.4s\n"
+ "46:" // Height 2: no shift correction
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x15, #0x10\n"
+ "srshl v12.4s, v12.4s, v0.4s\n"
+ "srshl v13.4s, v13.4s, v1.4s\n"
+ "srshl v14.4s, v14.4s, v2.4s\n"
+ "srshl v15.4s, v15.4s, v3.4s\n"
+ "add v8.4s, v8.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v4.4s\n"
+ "add v10.4s, v10.4s, v4.4s\n"
+ "smin v8.4s, v8.4s, v6.4s\n"
+ "smin v9.4s, v9.4s, v6.4s\n"
+ "smin v10.4s, v10.4s, v6.4s\n"
+ "smax v8.4s, v8.4s, v5.4s\n"
+ "smax v9.4s, v9.4s, v5.4s\n"
+ "smax v10.4s, v10.4s, v5.4s\n"
+ "add v11.4s, v11.4s, v4.4s\n"
+ "add v12.4s, v12.4s, v4.4s\n"
+ "add v13.4s, v13.4s, v4.4s\n"
+ "smin v11.4s, v11.4s, v6.4s\n"
+ "smin v12.4s, v12.4s, v6.4s\n"
+ "smin v13.4s, v13.4s, v6.4s\n"
+ "smax v11.4s, v11.4s, v5.4s\n"
+ "smax v12.4s, v12.4s, v5.4s\n"
+ "smax v13.4s, v13.4s, v5.4s\n"
+ "add v14.4s, v14.4s, v4.4s\n"
+ "add v15.4s, v15.4s, v4.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "smin v14.4s, v14.4s, v6.4s\n"
+ "smin v15.4s, v15.4s, v6.4s\n"
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "smax v14.4s, v14.4s, v5.4s\n"
+ "smax v15.4s, v15.4s, v5.4s\n"
+ "uzp1 v12.8h, v12.8h, v13.8h\n"
+ "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v12.16b, v12.16b, v13.16b\n"
+ "bge 55f\n"
+ "tbz x15, #3, 50f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "tbz x15, #2, 48f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x9], #0x4\n"
+ "tbz x15, #1, 47f\n"
+ "st1 { v8.h }[6], [x13], #0x2\n"
+ "st1 { v12.h }[6], [x9], #0x2\n"
+ "tbz x15, #0, 54f\n"
+ "st1 { v8.b }[14], [x13]\n"
+ "st1 { v12.b }[14], [x9]\n"
+ "b 54f\n"
+ "47:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 54f\n"
+ "st1 { v8.b }[12], [x13]\n"
+ "st1 { v12.b }[12], [x9]\n"
+ "b 54f\n"
+ "48:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 49f\n"
+ "st1 { v8.h }[4], [x13], #0x2\n"
+ "st1 { v12.h }[4], [x9], #0x2\n"
+ "tbz x15, #0, 54f\n"
+ "st1 { v8.b }[10], [x13]\n"
+ "st1 { v12.b }[10], [x9]\n"
+ "b 54f\n"
+ "49:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 54f\n"
+ "st1 { v8.b }[8], [x13]\n"
+ "st1 { v12.b }[8], [x9]\n"
+ "b 54f\n"
+ "50:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 52f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x9], #0x4\n"
+ "tbz x15, #1, 51f\n"
+ "st1 { v8.h }[2], [x13], #0x2\n"
+ "st1 { v12.h }[2], [x9], #0x2\n"
+ "tbz x15, #0, 54f\n"
+ "st1 { v8.b }[6], [x13]\n"
+ "st1 { v12.b }[6], [x9]\n"
+ "b 54f\n"
+ "51:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 54f\n"
+ "st1 { v8.b }[4], [x13]\n"
+ "st1 { v12.b }[4], [x9]\n"
+ "b 54f\n"
+ "52:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 53f\n"
+ "str h8, [x13], #0x2\n"
+ "str h12, [x9], #0x2\n"
+ "tbz x15, #0, 54f\n"
+ "st1 { v8.b }[2], [x13]\n"
+ "st1 { v12.b }[2], [x9]\n"
+ "b 54f\n"
+ "53:" // Height 2: Partial direct writeback: partial_1_0
+ "str b8, [x13, #0x0]\n"
+ "str b12, [x9, #0x0]\n"
+ "54:" // Height 2: Partial direct writeback: Done
+ "b 56f\n"
+ "55:" // Height 2: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q12, [x9, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "56:" // Height 2: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 31b\n"
+ "b 170f\n"
+ "57:" // Height 3
+ "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x16, %x[col_bias]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 58f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19\n"
+ "add x27, x27, x19\n"
+ "b 59f\n"
+ "58:" // Height 3: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19\n"
+ "add x27, x9, x19\n"
+ "59:" // Height 3: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "60:" // Height 3: setup done
+ "mov x12, #0x0\n"
+ "61:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 62f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x12, 63f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "b 63f\n"
+ "62:" // Height 3: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "63:" // Height 3: input setup done
+ "cmp x11, #0x10\n"
+ "blt 66f\n"
+ "cmp x11, #0x20\n"
+ "blt 65f\n"
+ "64:" // Height 3: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ "bge 64b\n"
+ "65:" // Height 3: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ "66:" // Height 3: Multiply loop: Main loop skip
+ "cbz x11, 71f\n"
+ "cmp x11, #0x4\n"
+ "blt 68f\n"
+ "67:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "bge 67b\n"
+ "cbz x11, 71f\n"
+ "68:" // Height 3: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 69f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "tbz x11, #0, 70f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x26]\n"
+ "b 70f\n"
+ "69:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x26, #0x0]\n"
+ "70:" // Height 3: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "71:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 61b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "ldr q0, [x16, #0x0]\n"
+ "add v8.4s, v8.4s, v0.4s\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "ldr q1, [x16, #0x10]\n"
+ "add v12.4s, v12.4s, v0.4s\n"
+ "ldr q2, [x16, #0x20]\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "ldr q3, [x16, #0x30]\n"
+ "add x16, x16, #0x40\n"
+ "add v9.4s, v9.4s, v1.4s\n"
+ "add v13.4s, v13.4s, v1.4s\n"
+ "add v10.4s, v10.4s, v2.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "add v14.4s, v14.4s, v2.4s\n"
+ "add v15.4s, v15.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "tbz %x[flags], #4, 72f\n"
+ "ldr q0, [x17, #0x0]\n"
+ "ldr q4, [x8, #0x0]\n"
+ "ldr q1, [x17, #0x10]\n"
+ "ldr q5, [x8, #0x10]\n"
+ "ldr q2, [x17, #0x20]\n"
+ "ldr q6, [x8, #0x20]\n"
+ "ldr q3, [x17, #0x30]\n"
+ "ldr q7, [x8, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "add x8, x8, #0x40\n"
+ "b 73f\n"
+ "72:" // Height 3: per layer parameters
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "mov v1.16b, v0.16b\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "73:" // Height 3: parameters loaded
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v4.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v5.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v6.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v7.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v5.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v7.4s\n"
+ "tbz %x[flags], #5, 74f\n"
+ "and v4.16b, v8.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v9.16b, v1.16b\n"
+ "and v6.16b, v10.16b, v2.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v4.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v4.16b, v12.16b, v0.16b\n"
+ "sqadd v9.4s, v9.4s, v5.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v6.4s\n"
+ "and v5.16b, v13.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v11.4s, v11.4s, v7.4s\n"
+ "and v6.16b, v14.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v4.4s\n"
+ "and v7.16b, v15.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v13.4s, v13.4s, v5.4s\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v14.4s, v14.4s, v6.4s\n"
+ "and v5.16b, v17.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v7.4s\n"
+ "and v6.16b, v18.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v7.16b, v19.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "74:" // Height 3: no shift correction
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x15, #0x10\n"
+ "srshl v12.4s, v12.4s, v0.4s\n"
+ "srshl v13.4s, v13.4s, v1.4s\n"
+ "srshl v14.4s, v14.4s, v2.4s\n"
+ "srshl v15.4s, v15.4s, v3.4s\n"
+ "add v8.4s, v8.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v4.4s\n"
+ "add v10.4s, v10.4s, v4.4s\n"
+ "smin v8.4s, v8.4s, v6.4s\n"
+ "smin v9.4s, v9.4s, v6.4s\n"
+ "smin v10.4s, v10.4s, v6.4s\n"
+ "smax v8.4s, v8.4s, v5.4s\n"
+ "smax v9.4s, v9.4s, v5.4s\n"
+ "smax v10.4s, v10.4s, v5.4s\n"
+ "add v11.4s, v11.4s, v4.4s\n"
+ "add v12.4s, v12.4s, v4.4s\n"
+ "add v13.4s, v13.4s, v4.4s\n"
+ "smin v11.4s, v11.4s, v6.4s\n"
+ "smin v12.4s, v12.4s, v6.4s\n"
+ "smin v13.4s, v13.4s, v6.4s\n"
+ "smax v11.4s, v11.4s, v5.4s\n"
+ "smax v12.4s, v12.4s, v5.4s\n"
+ "smax v13.4s, v13.4s, v5.4s\n"
+ "add v14.4s, v14.4s, v4.4s\n"
+ "add v15.4s, v15.4s, v4.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "smin v14.4s, v14.4s, v6.4s\n"
+ "smin v15.4s, v15.4s, v6.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
+ "smax v14.4s, v14.4s, v5.4s\n"
+ "smax v15.4s, v15.4s, v5.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "srshl v18.4s, v18.4s, v2.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "srshl v19.4s, v19.4s, v3.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "uzp1 v12.8h, v12.8h, v13.8h\n"
+ "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "uzp1 v12.16b, v12.16b, v13.16b\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "bge 83f\n"
+ "tbz x15, #3, 78f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "tbz x15, #2, 76f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x9], #0x4\n"
+ "st1 { v16.s }[2], [x27], #0x4\n"
+ "tbz x15, #1, 75f\n"
+ "st1 { v8.h }[6], [x13], #0x2\n"
+ "st1 { v12.h }[6], [x9], #0x2\n"
+ "st1 { v16.h }[6], [x27], #0x2\n"
+ "tbz x15, #0, 82f\n"
+ "st1 { v8.b }[14], [x13]\n"
+ "st1 { v12.b }[14], [x9]\n"
+ "st1 { v16.b }[14], [x27]\n"
+ "b 82f\n"
+ "75:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 82f\n"
+ "st1 { v8.b }[12], [x13]\n"
+ "st1 { v12.b }[12], [x9]\n"
+ "st1 { v16.b }[12], [x27]\n"
+ "b 82f\n"
+ "76:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 77f\n"
+ "st1 { v8.h }[4], [x13], #0x2\n"
+ "st1 { v12.h }[4], [x9], #0x2\n"
+ "st1 { v16.h }[4], [x27], #0x2\n"
+ "tbz x15, #0, 82f\n"
+ "st1 { v8.b }[10], [x13]\n"
+ "st1 { v12.b }[10], [x9]\n"
+ "st1 { v16.b }[10], [x27]\n"
+ "b 82f\n"
+ "77:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 82f\n"
+ "st1 { v8.b }[8], [x13]\n"
+ "st1 { v12.b }[8], [x9]\n"
+ "st1 { v16.b }[8], [x27]\n"
+ "b 82f\n"
+ "78:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 80f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x9], #0x4\n"
+ "str s16, [x27], #0x4\n"
+ "tbz x15, #1, 79f\n"
+ "st1 { v8.h }[2], [x13], #0x2\n"
+ "st1 { v12.h }[2], [x9], #0x2\n"
+ "st1 { v16.h }[2], [x27], #0x2\n"
+ "tbz x15, #0, 82f\n"
+ "st1 { v8.b }[6], [x13]\n"
+ "st1 { v12.b }[6], [x9]\n"
+ "st1 { v16.b }[6], [x27]\n"
+ "b 82f\n"
+ "79:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 82f\n"
+ "st1 { v8.b }[4], [x13]\n"
+ "st1 { v12.b }[4], [x9]\n"
+ "st1 { v16.b }[4], [x27]\n"
+ "b 82f\n"
+ "80:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 81f\n"
+ "str h8, [x13], #0x2\n"
+ "str h12, [x9], #0x2\n"
+ "str h16, [x27], #0x2\n"
+ "tbz x15, #0, 82f\n"
+ "st1 { v8.b }[2], [x13]\n"
+ "st1 { v12.b }[2], [x9]\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "b 82f\n"
+ "81:" // Height 3: Partial direct writeback: partial_1_0
+ "str b8, [x13, #0x0]\n"
+ "str b12, [x9, #0x0]\n"
+ "str b16, [x27, #0x0]\n"
+ "82:" // Height 3: Partial direct writeback: Done
+ "b 84f\n"
+ "83:" // Height 3: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q16, [x27, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "84:" // Height 3: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 59b\n"
+ "b 170f\n"
+ "85:" // Height 4
+ "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x16, %x[col_bias]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 86f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "add x27, x27, x19\n"
+ "add x25, x25, x19\n"
+ "b 87f\n"
+ "86:" // Height 4: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19\n"
+ "add x27, x9, x19\n"
+ "add x25, x27, x19\n"
+ "87:" // Height 4: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "88:" // Height 4: setup done
+ "mov x12, #0x0\n"
+ "89:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 90f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x12, 91f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "b 91f\n"
+ "90:" // Height 4: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "91:" // Height 4: input setup done
+ "cmp x11, #0x10\n"
+ "blt 94f\n"
+ "cmp x11, #0x20\n"
+ "blt 93f\n"
+ "92:" // Height 4: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ "bge 92b\n"
+ "93:" // Height 4: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ "94:" // Height 4: Multiply loop: Main loop skip
+ "cbz x11, 99f\n"
+ "cmp x11, #0x4\n"
+ "blt 96f\n"
+ "95:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "bge 95b\n"
+ "cbz x11, 99f\n"
+ "96:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 97f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "tbz x11, #0, 98f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "b 98f\n"
+ "97:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x26, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "98:" // Height 4: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "99:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 89b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "ldr q0, [x16, #0x0]\n"
+ "add v8.4s, v8.4s, v0.4s\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "ldr q1, [x16, #0x10]\n"
+ "add v12.4s, v12.4s, v0.4s\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "ldr q2, [x16, #0x20]\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "ldr q3, [x16, #0x30]\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add x16, x16, #0x40\n"
+ "add v9.4s, v9.4s, v1.4s\n"
+ "add v13.4s, v13.4s, v1.4s\n"
+ "add v10.4s, v10.4s, v2.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "add v14.4s, v14.4s, v2.4s\n"
+ "add v15.4s, v15.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "tbz %x[flags], #4, 100f\n"
+ "ldr q0, [x17, #0x0]\n"
+ "ldr q4, [x8, #0x0]\n"
+ "ldr q1, [x17, #0x10]\n"
+ "ldr q5, [x8, #0x10]\n"
+ "ldr q2, [x17, #0x20]\n"
+ "ldr q6, [x8, #0x20]\n"
+ "ldr q3, [x17, #0x30]\n"
+ "ldr q7, [x8, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "add x8, x8, #0x40\n"
+ "b 101f\n"
+ "100:" // Height 4: per layer parameters
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "mov v1.16b, v0.16b\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "101:" // Height 4: parameters loaded
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v4.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v5.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v6.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v7.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v5.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v7.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v5.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v6.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v7.4s\n"
+ "tbz %x[flags], #5, 102f\n"
+ "and v4.16b, v8.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v9.16b, v1.16b\n"
+ "and v6.16b, v10.16b, v2.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v4.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v4.16b, v12.16b, v0.16b\n"
+ "sqadd v9.4s, v9.4s, v5.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v6.4s\n"
+ "and v5.16b, v13.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v11.4s, v11.4s, v7.4s\n"
+ "and v6.16b, v14.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v4.4s\n"
+ "and v7.16b, v15.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v13.4s, v13.4s, v5.4s\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v14.4s, v14.4s, v6.4s\n"
+ "and v5.16b, v17.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v7.4s\n"
+ "and v6.16b, v18.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v7.16b, v19.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "and v4.16b, v20.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "and v5.16b, v21.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v6.16b, v22.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v4.4s\n"
+ "and v7.16b, v23.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v5.4s\n"
+ "sqadd v22.4s, v22.4s, v6.4s\n"
+ "sqadd v23.4s, v23.4s, v7.4s\n"
+ "102:" // Height 4: no shift correction
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x15, #0x10\n"
+ "srshl v12.4s, v12.4s, v0.4s\n"
+ "srshl v13.4s, v13.4s, v1.4s\n"
+ "srshl v14.4s, v14.4s, v2.4s\n"
+ "srshl v15.4s, v15.4s, v3.4s\n"
+ "add v8.4s, v8.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v4.4s\n"
+ "add v10.4s, v10.4s, v4.4s\n"
+ "smin v8.4s, v8.4s, v6.4s\n"
+ "smin v9.4s, v9.4s, v6.4s\n"
+ "smin v10.4s, v10.4s, v6.4s\n"
+ "smax v8.4s, v8.4s, v5.4s\n"
+ "smax v9.4s, v9.4s, v5.4s\n"
+ "smax v10.4s, v10.4s, v5.4s\n"
+ "add v11.4s, v11.4s, v4.4s\n"
+ "add v12.4s, v12.4s, v4.4s\n"
+ "add v13.4s, v13.4s, v4.4s\n"
+ "smin v11.4s, v11.4s, v6.4s\n"
+ "smin v12.4s, v12.4s, v6.4s\n"
+ "smin v13.4s, v13.4s, v6.4s\n"
+ "smax v11.4s, v11.4s, v5.4s\n"
+ "smax v12.4s, v12.4s, v5.4s\n"
+ "smax v13.4s, v13.4s, v5.4s\n"
+ "add v14.4s, v14.4s, v4.4s\n"
+ "add v15.4s, v15.4s, v4.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "smin v14.4s, v14.4s, v6.4s\n"
+ "smin v15.4s, v15.4s, v6.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
+ "smax v14.4s, v14.4s, v5.4s\n"
+ "smax v15.4s, v15.4s, v5.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "srshl v18.4s, v18.4s, v2.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "srshl v19.4s, v19.4s, v3.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "srshl v22.4s, v22.4s, v2.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "srshl v23.4s, v23.4s, v3.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "uzp1 v12.8h, v12.8h, v13.8h\n"
+ "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "uzp1 v12.16b, v12.16b, v13.16b\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "bge 111f\n"
+ "tbz x15, #3, 106f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "tbz x15, #2, 104f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x9], #0x4\n"
+ "st1 { v16.s }[2], [x27], #0x4\n"
+ "st1 { v20.s }[2], [x25], #0x4\n"
+ "tbz x15, #1, 103f\n"
+ "st1 { v8.h }[6], [x13], #0x2\n"
+ "st1 { v12.h }[6], [x9], #0x2\n"
+ "st1 { v16.h }[6], [x27], #0x2\n"
+ "st1 { v20.h }[6], [x25], #0x2\n"
+ "tbz x15, #0, 110f\n"
+ "st1 { v8.b }[14], [x13]\n"
+ "st1 { v12.b }[14], [x9]\n"
+ "st1 { v16.b }[14], [x27]\n"
+ "st1 { v20.b }[14], [x25]\n"
+ "b 110f\n"
+ "103:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 110f\n"
+ "st1 { v8.b }[12], [x13]\n"
+ "st1 { v12.b }[12], [x9]\n"
+ "st1 { v16.b }[12], [x27]\n"
+ "st1 { v20.b }[12], [x25]\n"
+ "b 110f\n"
+ "104:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 105f\n"
+ "st1 { v8.h }[4], [x13], #0x2\n"
+ "st1 { v12.h }[4], [x9], #0x2\n"
+ "st1 { v16.h }[4], [x27], #0x2\n"
+ "st1 { v20.h }[4], [x25], #0x2\n"
+ "tbz x15, #0, 110f\n"
+ "st1 { v8.b }[10], [x13]\n"
+ "st1 { v12.b }[10], [x9]\n"
+ "st1 { v16.b }[10], [x27]\n"
+ "st1 { v20.b }[10], [x25]\n"
+ "b 110f\n"
+ "105:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 110f\n"
+ "st1 { v8.b }[8], [x13]\n"
+ "st1 { v12.b }[8], [x9]\n"
+ "st1 { v16.b }[8], [x27]\n"
+ "st1 { v20.b }[8], [x25]\n"
+ "b 110f\n"
+ "106:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 108f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x9], #0x4\n"
+ "str s16, [x27], #0x4\n"
+ "str s20, [x25], #0x4\n"
+ "tbz x15, #1, 107f\n"
+ "st1 { v8.h }[2], [x13], #0x2\n"
+ "st1 { v12.h }[2], [x9], #0x2\n"
+ "st1 { v16.h }[2], [x27], #0x2\n"
+ "st1 { v20.h }[2], [x25], #0x2\n"
+ "tbz x15, #0, 110f\n"
+ "st1 { v8.b }[6], [x13]\n"
+ "st1 { v12.b }[6], [x9]\n"
+ "st1 { v16.b }[6], [x27]\n"
+ "st1 { v20.b }[6], [x25]\n"
+ "b 110f\n"
+ "107:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 110f\n"
+ "st1 { v8.b }[4], [x13]\n"
+ "st1 { v12.b }[4], [x9]\n"
+ "st1 { v16.b }[4], [x27]\n"
+ "st1 { v20.b }[4], [x25]\n"
+ "b 110f\n"
+ "108:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 109f\n"
+ "str h8, [x13], #0x2\n"
+ "str h12, [x9], #0x2\n"
+ "str h16, [x27], #0x2\n"
+ "str h20, [x25], #0x2\n"
+ "tbz x15, #0, 110f\n"
+ "st1 { v8.b }[2], [x13]\n"
+ "st1 { v12.b }[2], [x9]\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "st1 { v20.b }[2], [x25]\n"
+ "b 110f\n"
+ "109:" // Height 4: Partial direct writeback: partial_1_0
+ "str b8, [x13, #0x0]\n"
+ "str b12, [x9, #0x0]\n"
+ "str b16, [x27, #0x0]\n"
+ "str b20, [x25, #0x0]\n"
+ "110:" // Height 4: Partial direct writeback: Done
+ "b 112f\n"
+ "111:" // Height 4: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q20, [x25, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "112:" // Height 4: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 87b\n"
+ "b 170f\n"
+ "113:" // Height 5
+ "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x16, %x[col_bias]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 114f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19\n"
+ "add x25, x25, x19\n"
+ "add x23, x23, x19\n"
+ "b 115f\n"
+ "114:" // Height 5: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19\n"
+ "add x27, x9, x19\n"
+ "add x25, x27, x19\n"
+ "add x23, x25, x19\n"
+ "115:" // Height 5: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "116:" // Height 5: setup done
+ "mov x12, #0x0\n"
+ "117:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 118f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x12, 119f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "b 119f\n"
+ "118:" // Height 5: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "119:" // Height 5: input setup done
+ "cmp x11, #0x10\n"
+ "blt 122f\n"
+ "cmp x11, #0x20\n"
+ "blt 121f\n"
+ "120:" // Height 5: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ "bge 120b\n"
+ "121:" // Height 5: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ "122:" // Height 5: Multiply loop: Main loop skip
+ "cbz x11, 127f\n"
+ "cmp x11, #0x4\n"
+ "blt 124f\n"
+ "123:" // Height 5: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ "bge 123b\n"
+ "cbz x11, 127f\n"
+ "124:" // Height 5: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 125f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "tbz x11, #0, 126f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "b 126f\n"
+ "125:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x26, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "126:" // Height 5: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ "127:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 117b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "ldr q0, [x16, #0x0]\n"
+ "add v8.4s, v8.4s, v0.4s\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "ldr q1, [x16, #0x10]\n"
+ "add v12.4s, v12.4s, v0.4s\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "ldr q2, [x16, #0x20]\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "ldr q3, [x16, #0x30]\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add x16, x16, #0x40\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "add v9.4s, v9.4s, v1.4s\n"
+ "add v13.4s, v13.4s, v1.4s\n"
+ "add v10.4s, v10.4s, v2.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "add v14.4s, v14.4s, v2.4s\n"
+ "add v15.4s, v15.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v1.4s\n"
+ "add v26.4s, v26.4s, v2.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "tbz %x[flags], #4, 128f\n"
+ "ldr q0, [x17, #0x0]\n"
+ "ldr q4, [x8, #0x0]\n"
+ "ldr q1, [x17, #0x10]\n"
+ "ldr q5, [x8, #0x10]\n"
+ "ldr q2, [x17, #0x20]\n"
+ "ldr q6, [x8, #0x20]\n"
+ "ldr q3, [x17, #0x30]\n"
+ "ldr q7, [x8, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "add x8, x8, #0x40\n"
+ "b 129f\n"
+ "128:" // Height 5: per layer parameters
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "mov v1.16b, v0.16b\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "129:" // Height 5: parameters loaded
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v4.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v5.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v6.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v7.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v5.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v7.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v5.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v6.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v7.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v5.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v7.4s\n"
+ "tbz %x[flags], #5, 130f\n"
+ "and v4.16b, v8.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v9.16b, v1.16b\n"
+ "and v6.16b, v10.16b, v2.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v4.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v4.16b, v12.16b, v0.16b\n"
+ "sqadd v9.4s, v9.4s, v5.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v6.4s\n"
+ "and v5.16b, v13.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v11.4s, v11.4s, v7.4s\n"
+ "and v6.16b, v14.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v4.4s\n"
+ "and v7.16b, v15.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v13.4s, v13.4s, v5.4s\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v14.4s, v14.4s, v6.4s\n"
+ "and v5.16b, v17.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v7.4s\n"
+ "and v6.16b, v18.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v7.16b, v19.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "and v4.16b, v20.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "and v5.16b, v21.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v6.16b, v22.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v4.4s\n"
+ "and v7.16b, v23.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v5.4s\n"
+ "and v4.16b, v24.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v6.4s\n"
+ "and v5.16b, v25.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v7.4s\n"
+ "and v6.16b, v26.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v4.4s\n"
+ "and v7.16b, v27.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v5.4s\n"
+ "sqadd v26.4s, v26.4s, v6.4s\n"
+ "sqadd v27.4s, v27.4s, v7.4s\n"
+ "130:" // Height 5: no shift correction
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x15, #0x10\n"
+ "srshl v12.4s, v12.4s, v0.4s\n"
+ "srshl v13.4s, v13.4s, v1.4s\n"
+ "srshl v14.4s, v14.4s, v2.4s\n"
+ "srshl v15.4s, v15.4s, v3.4s\n"
+ "add v8.4s, v8.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v4.4s\n"
+ "add v10.4s, v10.4s, v4.4s\n"
+ "smin v8.4s, v8.4s, v6.4s\n"
+ "smin v9.4s, v9.4s, v6.4s\n"
+ "smin v10.4s, v10.4s, v6.4s\n"
+ "smax v8.4s, v8.4s, v5.4s\n"
+ "smax v9.4s, v9.4s, v5.4s\n"
+ "smax v10.4s, v10.4s, v5.4s\n"
+ "add v11.4s, v11.4s, v4.4s\n"
+ "add v12.4s, v12.4s, v4.4s\n"
+ "add v13.4s, v13.4s, v4.4s\n"
+ "smin v11.4s, v11.4s, v6.4s\n"
+ "smin v12.4s, v12.4s, v6.4s\n"
+ "smin v13.4s, v13.4s, v6.4s\n"
+ "smax v11.4s, v11.4s, v5.4s\n"
+ "smax v12.4s, v12.4s, v5.4s\n"
+ "smax v13.4s, v13.4s, v5.4s\n"
+ "add v14.4s, v14.4s, v4.4s\n"
+ "add v15.4s, v15.4s, v4.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "smin v14.4s, v14.4s, v6.4s\n"
+ "smin v15.4s, v15.4s, v6.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
+ "smax v14.4s, v14.4s, v5.4s\n"
+ "smax v15.4s, v15.4s, v5.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "srshl v18.4s, v18.4s, v2.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "srshl v19.4s, v19.4s, v3.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "srshl v22.4s, v22.4s, v2.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "srshl v23.4s, v23.4s, v3.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "srshl v26.4s, v26.4s, v2.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "srshl v27.4s, v27.4s, v3.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "smin v26.4s, v26.4s, v6.4s\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "smax v26.4s, v26.4s, v5.4s\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
+ "uzp1 v12.8h, v12.8h, v13.8h\n"
+ "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "uzp1 v12.16b, v12.16b, v13.16b\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "bge 139f\n"
+ "tbz x15, #3, 134f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "tbz x15, #2, 132f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x9], #0x4\n"
+ "st1 { v16.s }[2], [x27], #0x4\n"
+ "st1 { v20.s }[2], [x25], #0x4\n"
+ "st1 { v24.s }[2], [x23], #0x4\n"
+ "tbz x15, #1, 131f\n"
+ "st1 { v8.h }[6], [x13], #0x2\n"
+ "st1 { v12.h }[6], [x9], #0x2\n"
+ "st1 { v16.h }[6], [x27], #0x2\n"
+ "st1 { v20.h }[6], [x25], #0x2\n"
+ "st1 { v24.h }[6], [x23], #0x2\n"
+ "tbz x15, #0, 138f\n"
+ "st1 { v8.b }[14], [x13]\n"
+ "st1 { v12.b }[14], [x9]\n"
+ "st1 { v16.b }[14], [x27]\n"
+ "st1 { v20.b }[14], [x25]\n"
+ "st1 { v24.b }[14], [x23]\n"
+ "b 138f\n"
+ "131:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 138f\n"
+ "st1 { v8.b }[12], [x13]\n"
+ "st1 { v12.b }[12], [x9]\n"
+ "st1 { v16.b }[12], [x27]\n"
+ "st1 { v20.b }[12], [x25]\n"
+ "st1 { v24.b }[12], [x23]\n"
+ "b 138f\n"
+ "132:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 133f\n"
+ "st1 { v8.h }[4], [x13], #0x2\n"
+ "st1 { v12.h }[4], [x9], #0x2\n"
+ "st1 { v16.h }[4], [x27], #0x2\n"
+ "st1 { v20.h }[4], [x25], #0x2\n"
+ "st1 { v24.h }[4], [x23], #0x2\n"
+ "tbz x15, #0, 138f\n"
+ "st1 { v8.b }[10], [x13]\n"
+ "st1 { v12.b }[10], [x9]\n"
+ "st1 { v16.b }[10], [x27]\n"
+ "st1 { v20.b }[10], [x25]\n"
+ "st1 { v24.b }[10], [x23]\n"
+ "b 138f\n"
+ "133:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 138f\n"
+ "st1 { v8.b }[8], [x13]\n"
+ "st1 { v12.b }[8], [x9]\n"
+ "st1 { v16.b }[8], [x27]\n"
+ "st1 { v20.b }[8], [x25]\n"
+ "st1 { v24.b }[8], [x23]\n"
+ "b 138f\n"
+ "134:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 136f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x9], #0x4\n"
+ "str s16, [x27], #0x4\n"
+ "str s20, [x25], #0x4\n"
+ "str s24, [x23], #0x4\n"
+ "tbz x15, #1, 135f\n"
+ "st1 { v8.h }[2], [x13], #0x2\n"
+ "st1 { v12.h }[2], [x9], #0x2\n"
+ "st1 { v16.h }[2], [x27], #0x2\n"
+ "st1 { v20.h }[2], [x25], #0x2\n"
+ "st1 { v24.h }[2], [x23], #0x2\n"
+ "tbz x15, #0, 138f\n"
+ "st1 { v8.b }[6], [x13]\n"
+ "st1 { v12.b }[6], [x9]\n"
+ "st1 { v16.b }[6], [x27]\n"
+ "st1 { v20.b }[6], [x25]\n"
+ "st1 { v24.b }[6], [x23]\n"
+ "b 138f\n"
+ "135:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 138f\n"
+ "st1 { v8.b }[4], [x13]\n"
+ "st1 { v12.b }[4], [x9]\n"
+ "st1 { v16.b }[4], [x27]\n"
+ "st1 { v20.b }[4], [x25]\n"
+ "st1 { v24.b }[4], [x23]\n"
+ "b 138f\n"
+ "136:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 137f\n"
+ "str h8, [x13], #0x2\n"
+ "str h12, [x9], #0x2\n"
+ "str h16, [x27], #0x2\n"
+ "str h20, [x25], #0x2\n"
+ "str h24, [x23], #0x2\n"
+ "tbz x15, #0, 138f\n"
+ "st1 { v8.b }[2], [x13]\n"
+ "st1 { v12.b }[2], [x9]\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "st1 { v20.b }[2], [x25]\n"
+ "st1 { v24.b }[2], [x23]\n"
+ "b 138f\n"
+ "137:" // Height 5: Partial direct writeback: partial_1_0
+ "str b8, [x13, #0x0]\n"
+ "str b12, [x9, #0x0]\n"
+ "str b16, [x27, #0x0]\n"
+ "str b20, [x25, #0x0]\n"
+ "str b24, [x23, #0x0]\n"
+ "138:" // Height 5: Partial direct writeback: Done
+ "b 140f\n"
+ "139:" // Height 5: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q24, [x23, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "140:" // Height 5: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 115b\n"
+ "b 170f\n"
+ "141:" // Height 6
+ "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x16, %x[col_bias]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 142f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19\n"
+ "ldr x21, [%x[output_ptr], #0x28]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "add x25, x25, x19\n"
+ "add x23, x23, x19\n"
+ "add x21, x21, x19\n"
+ "b 143f\n"
+ "142:" // Height 6: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19\n"
+ "add x27, x9, x19\n"
+ "add x25, x27, x19\n"
+ "add x23, x25, x19\n"
+ "add x21, x23, x19\n"
+ "add %x[output_ptr], x21, x19\n"
+ "143:" // Height 6: Column loop
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "144:" // Height 6: setup done
+ "mov x12, #0x0\n"
+ "145:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 146f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x12, 147f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "add x20, x20, x19\n"
+ "b 147f\n"
+ "146:" // Height 6: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "add x20, x22, x19\n"
+ "147:" // Height 6: input setup done
+ "cmp x11, #0x10\n"
+ "blt 150f\n"
+ "cmp x11, #0x20\n"
+ "blt 149f\n"
+ "148:" // Height 6: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
+ "bge 148b\n"
+ "149:" // Height 6: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
+ "150:" // Height 6: Multiply loop: Main loop skip
+ "cbz x11, 155f\n"
+ "cmp x11, #0x4\n"
+ "blt 152f\n"
+ "151:" // Height 6: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s5, [x20], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ "bge 151b\n"
+ "cbz x11, 155f\n"
+ "152:" // Height 6: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 153f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h5, [x20], #0x2\n"
+ "tbz x11, #0, 154f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 154f\n"
+ "153:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x26, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "ldr b5, [x20, #0x0]\n"
+ "154:" // Height 6: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ "155:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 145b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "ldr q0, [x16, #0x0]\n"
+ "add v8.4s, v8.4s, v0.4s\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "ldr q1, [x16, #0x10]\n"
+ "add v12.4s, v12.4s, v0.4s\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "ldr q2, [x16, #0x20]\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "ldr q3, [x16, #0x30]\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "add x16, x16, #0x40\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "add v28.4s, v28.4s, v0.4s\n"
+ "add v9.4s, v9.4s, v1.4s\n"
+ "add v10.4s, v10.4s, v2.4s\n"
+ "add v11.4s, v11.4s, v3.4s\n"
+ "add v13.4s, v13.4s, v1.4s\n"
+ "add v14.4s, v14.4s, v2.4s\n"
+ "add v15.4s, v15.4s, v3.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v25.4s, v25.4s, v1.4s\n"
+ "add v26.4s, v26.4s, v2.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "add v29.4s, v29.4s, v1.4s\n"
+ "add v30.4s, v30.4s, v2.4s\n"
+ "add v31.4s, v31.4s, v3.4s\n"
+ "tbz %x[flags], #4, 156f\n"
+ "ldr q0, [x17, #0x0]\n"
+ "ldr q4, [x8, #0x0]\n"
+ "ldr q1, [x17, #0x10]\n"
+ "ldr q5, [x8, #0x10]\n"
+ "ldr q2, [x17, #0x20]\n"
+ "ldr q6, [x8, #0x20]\n"
+ "ldr q3, [x17, #0x30]\n"
+ "ldr q7, [x8, #0x30]\n"
+ "add x17, x17, #0x40\n"
+ "add x8, x8, #0x40\n"
+ "b 157f\n"
+ "156:" // Height 6: per layer parameters
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x19]\n"
+ "mov v1.16b, v0.16b\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "mov v2.16b, v0.16b\n"
+ "mov v3.16b, v0.16b\n"
+ "mov v5.16b, v4.16b\n"
+ "mov v6.16b, v4.16b\n"
+ "mov v7.16b, v4.16b\n"
+ "157:" // Height 6: parameters loaded
+ "sqrdmulh v8.4s, v8.4s, v4.4s\n"
+ "sqrdmulh v9.4s, v9.4s, v5.4s\n"
+ "sqrdmulh v10.4s, v10.4s, v6.4s\n"
+ "sqrdmulh v11.4s, v11.4s, v7.4s\n"
+ "sqrdmulh v12.4s, v12.4s, v4.4s\n"
+ "sqrdmulh v13.4s, v13.4s, v5.4s\n"
+ "sqrdmulh v14.4s, v14.4s, v6.4s\n"
+ "sqrdmulh v15.4s, v15.4s, v7.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v5.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v6.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v7.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v5.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v6.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v7.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v5.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v6.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v7.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v4.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v5.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v6.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v7.4s\n"
+ "tbz %x[flags], #5, 158f\n"
+ "and v4.16b, v8.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v9.16b, v1.16b\n"
+ "and v6.16b, v10.16b, v2.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v11.16b, v3.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v8.4s, v8.4s, v4.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v4.16b, v12.16b, v0.16b\n"
+ "sqadd v9.4s, v9.4s, v5.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v10.4s, v10.4s, v6.4s\n"
+ "and v5.16b, v13.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v11.4s, v11.4s, v7.4s\n"
+ "and v6.16b, v14.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v12.4s, v12.4s, v4.4s\n"
+ "and v7.16b, v15.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v13.4s, v13.4s, v5.4s\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v14.4s, v14.4s, v6.4s\n"
+ "and v5.16b, v17.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v15.4s, v15.4s, v7.4s\n"
+ "and v6.16b, v18.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v7.16b, v19.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "and v4.16b, v20.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "and v5.16b, v21.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "and v6.16b, v22.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v20.4s, v20.4s, v4.4s\n"
+ "and v7.16b, v23.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v21.4s, v21.4s, v5.4s\n"
+ "and v4.16b, v24.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v22.4s, v22.4s, v6.4s\n"
+ "and v5.16b, v25.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v23.4s, v23.4s, v7.4s\n"
+ "and v6.16b, v26.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v4.4s\n"
+ "and v7.16b, v27.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v5.4s\n"
+ "and v4.16b, v28.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v6.4s\n"
+ "and v5.16b, v29.16b, v1.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v27.4s, v27.4s, v7.4s\n"
+ "and v6.16b, v30.16b, v2.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v4.4s\n"
+ "and v7.16b, v31.16b, v3.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v29.4s, v29.4s, v5.4s\n"
+ "sqadd v30.4s, v30.4s, v6.4s\n"
+ "sqadd v31.4s, v31.4s, v7.4s\n"
+ "158:" // Height 6: no shift correction
+ "srshl v8.4s, v8.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v9.4s, v9.4s, v1.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v10.4s, v10.4s, v2.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v11.4s, v11.4s, v3.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x15, #0x10\n"
+ "srshl v12.4s, v12.4s, v0.4s\n"
+ "srshl v13.4s, v13.4s, v1.4s\n"
+ "srshl v14.4s, v14.4s, v2.4s\n"
+ "srshl v15.4s, v15.4s, v3.4s\n"
+ "add v8.4s, v8.4s, v4.4s\n"
+ "add v9.4s, v9.4s, v4.4s\n"
+ "add v10.4s, v10.4s, v4.4s\n"
+ "smin v8.4s, v8.4s, v6.4s\n"
+ "smin v9.4s, v9.4s, v6.4s\n"
+ "smin v10.4s, v10.4s, v6.4s\n"
+ "smax v8.4s, v8.4s, v5.4s\n"
+ "smax v9.4s, v9.4s, v5.4s\n"
+ "smax v10.4s, v10.4s, v5.4s\n"
+ "add v11.4s, v11.4s, v4.4s\n"
+ "add v12.4s, v12.4s, v4.4s\n"
+ "add v13.4s, v13.4s, v4.4s\n"
+ "smin v11.4s, v11.4s, v6.4s\n"
+ "smin v12.4s, v12.4s, v6.4s\n"
+ "smin v13.4s, v13.4s, v6.4s\n"
+ "smax v11.4s, v11.4s, v5.4s\n"
+ "smax v12.4s, v12.4s, v5.4s\n"
+ "smax v13.4s, v13.4s, v5.4s\n"
+ "add v14.4s, v14.4s, v4.4s\n"
+ "add v15.4s, v15.4s, v4.4s\n"
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "smin v14.4s, v14.4s, v6.4s\n"
+ "smin v15.4s, v15.4s, v6.4s\n"
+ "srshl v17.4s, v17.4s, v1.4s\n"
+ "smax v14.4s, v14.4s, v5.4s\n"
+ "smax v15.4s, v15.4s, v5.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "srshl v18.4s, v18.4s, v2.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "srshl v19.4s, v19.4s, v3.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "srshl v21.4s, v21.4s, v1.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "srshl v22.4s, v22.4s, v2.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "srshl v23.4s, v23.4s, v3.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "srshl v25.4s, v25.4s, v1.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "srshl v26.4s, v26.4s, v2.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "srshl v27.4s, v27.4s, v3.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "srshl v28.4s, v28.4s, v0.4s\n"
+ "smin v26.4s, v26.4s, v6.4s\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
+ "srshl v29.4s, v29.4s, v1.4s\n"
+ "smax v26.4s, v26.4s, v5.4s\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "srshl v30.4s, v30.4s, v2.4s\n"
+ "smin v28.4s, v28.4s, v6.4s\n"
+ "smin v29.4s, v29.4s, v6.4s\n"
+ "srshl v31.4s, v31.4s, v3.4s\n"
+ "smax v28.4s, v28.4s, v5.4s\n"
+ "smax v29.4s, v29.4s, v5.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "uzp1 v8.8h, v8.8h, v9.8h\n"
+ "smin v30.4s, v30.4s, v6.4s\n"
+ "smin v31.4s, v31.4s, v6.4s\n"
+ "uzp1 v9.8h, v10.8h, v11.8h\n"
+ "smax v30.4s, v30.4s, v5.4s\n"
+ "smax v31.4s, v31.4s, v5.4s\n"
+ "uzp1 v12.8h, v12.8h, v13.8h\n"
+ "uzp1 v13.8h, v14.8h, v15.8h\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v28.8h, v28.8h, v29.8h\n"
+ "uzp1 v29.8h, v30.8h, v31.8h\n"
+ "uzp1 v8.16b, v8.16b, v9.16b\n"
+ "uzp1 v12.16b, v12.16b, v13.16b\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v28.16b, v28.16b, v29.16b\n"
+ "bge 167f\n"
+ "tbz x15, #3, 162f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x15, #2, 160f\n"
+ "st1 { v8.s }[2], [x13], #0x4\n"
+ "st1 { v12.s }[2], [x9], #0x4\n"
+ "st1 { v16.s }[2], [x27], #0x4\n"
+ "st1 { v20.s }[2], [x25], #0x4\n"
+ "st1 { v24.s }[2], [x23], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
+ "tbz x15, #1, 159f\n"
+ "st1 { v8.h }[6], [x13], #0x2\n"
+ "st1 { v12.h }[6], [x9], #0x2\n"
+ "st1 { v16.h }[6], [x27], #0x2\n"
+ "st1 { v20.h }[6], [x25], #0x2\n"
+ "st1 { v24.h }[6], [x23], #0x2\n"
+ "st1 { v28.h }[6], [x21], #0x2\n"
+ "tbz x15, #0, 166f\n"
+ "st1 { v8.b }[14], [x13]\n"
+ "st1 { v12.b }[14], [x9]\n"
+ "st1 { v16.b }[14], [x27]\n"
+ "st1 { v20.b }[14], [x25]\n"
+ "st1 { v24.b }[14], [x23]\n"
+ "st1 { v28.b }[14], [x21]\n"
+ "b 166f\n"
+ "159:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 166f\n"
+ "st1 { v8.b }[12], [x13]\n"
+ "st1 { v12.b }[12], [x9]\n"
+ "st1 { v16.b }[12], [x27]\n"
+ "st1 { v20.b }[12], [x25]\n"
+ "st1 { v24.b }[12], [x23]\n"
+ "st1 { v28.b }[12], [x21]\n"
+ "b 166f\n"
+ "160:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 161f\n"
+ "st1 { v8.h }[4], [x13], #0x2\n"
+ "st1 { v12.h }[4], [x9], #0x2\n"
+ "st1 { v16.h }[4], [x27], #0x2\n"
+ "st1 { v20.h }[4], [x25], #0x2\n"
+ "st1 { v24.h }[4], [x23], #0x2\n"
+ "st1 { v28.h }[4], [x21], #0x2\n"
+ "tbz x15, #0, 166f\n"
+ "st1 { v8.b }[10], [x13]\n"
+ "st1 { v12.b }[10], [x9]\n"
+ "st1 { v16.b }[10], [x27]\n"
+ "st1 { v20.b }[10], [x25]\n"
+ "st1 { v24.b }[10], [x23]\n"
+ "st1 { v28.b }[10], [x21]\n"
+ "b 166f\n"
+ "161:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 166f\n"
+ "st1 { v8.b }[8], [x13]\n"
+ "st1 { v12.b }[8], [x9]\n"
+ "st1 { v16.b }[8], [x27]\n"
+ "st1 { v20.b }[8], [x25]\n"
+ "st1 { v24.b }[8], [x23]\n"
+ "st1 { v28.b }[8], [x21]\n"
+ "b 166f\n"
+ "162:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 164f\n"
+ "str s8, [x13], #0x4\n"
+ "str s12, [x9], #0x4\n"
+ "str s16, [x27], #0x4\n"
+ "str s20, [x25], #0x4\n"
+ "str s24, [x23], #0x4\n"
+ "str s28, [x21], #0x4\n"
+ "tbz x15, #1, 163f\n"
+ "st1 { v8.h }[2], [x13], #0x2\n"
+ "st1 { v12.h }[2], [x9], #0x2\n"
+ "st1 { v16.h }[2], [x27], #0x2\n"
+ "st1 { v20.h }[2], [x25], #0x2\n"
+ "st1 { v24.h }[2], [x23], #0x2\n"
+ "st1 { v28.h }[2], [x21], #0x2\n"
+ "tbz x15, #0, 166f\n"
+ "st1 { v8.b }[6], [x13]\n"
+ "st1 { v12.b }[6], [x9]\n"
+ "st1 { v16.b }[6], [x27]\n"
+ "st1 { v20.b }[6], [x25]\n"
+ "st1 { v24.b }[6], [x23]\n"
+ "st1 { v28.b }[6], [x21]\n"
+ "b 166f\n"
+ "163:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 166f\n"
+ "st1 { v8.b }[4], [x13]\n"
+ "st1 { v12.b }[4], [x9]\n"
+ "st1 { v16.b }[4], [x27]\n"
+ "st1 { v20.b }[4], [x25]\n"
+ "st1 { v24.b }[4], [x23]\n"
+ "st1 { v28.b }[4], [x21]\n"
+ "b 166f\n"
+ "164:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 165f\n"
+ "str h8, [x13], #0x2\n"
+ "str h12, [x9], #0x2\n"
+ "str h16, [x27], #0x2\n"
+ "str h20, [x25], #0x2\n"
+ "str h24, [x23], #0x2\n"
+ "str h28, [x21], #0x2\n"
+ "tbz x15, #0, 166f\n"
+ "st1 { v8.b }[2], [x13]\n"
+ "st1 { v12.b }[2], [x9]\n"
+ "st1 { v16.b }[2], [x27]\n"
+ "st1 { v20.b }[2], [x25]\n"
+ "st1 { v24.b }[2], [x23]\n"
+ "st1 { v28.b }[2], [x21]\n"
+ "b 166f\n"
+ "165:" // Height 6: Partial direct writeback: partial_1_0
+ "str b8, [x13, #0x0]\n"
+ "str b12, [x9, #0x0]\n"
+ "str b16, [x27, #0x0]\n"
+ "str b20, [x25, #0x0]\n"
+ "str b24, [x23, #0x0]\n"
+ "str b28, [x21, #0x0]\n"
+ "166:" // Height 6: Partial direct writeback: Done
+ "b 168f\n"
+ "167:" // Height 6: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q28, [x21, #0x0]\n"
+ "add x13, x13, #0x10\n"
+ "add x9, x9, #0x10\n"
+ "add x27, x27, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "168:" // Height 6: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 143b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 170f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 169f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "169:" // Update direct input
+ "mov x19, #0x6\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "170:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
deleted file mode 100644
index 4a7cdc59a7..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/a55.cpp
+++ /dev/null
@@ -1,2434 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include <cstdint>
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_s8s32_dot_16x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool accumulate) {
- const int K_stride = ((K + 3) / 4) * 4;
- const long loops_count = ((K + 16) / 32) - 1;
- K -= loops_count * 32;
- const long regs_count = (K / 16) - 1;
- K -= (regs_count + 1) * 16;
- const long blocks_count = K / 4;
- const long odds_count = K - (blocks_count * 4);
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const int8_t * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(int8_t);
-
- int32_t *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=16ul) {
- const long width = std::min((unsigned long)N-x0, 16ul);
- long loops = loops_count;
- long regs = regs_count;
- long blocks = blocks_count;
- long odds = odds_count;
- const int8_t *a_ptr0 = a_ptr0_base;
- const int8_t *b_ptr0 = B + (K_stride * x0);
- const bool use_result_buffer = (width < 16);
- int32_t result_buffer[64];
- const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(int32_t);
- int32_t *c_ptr_real = c_ptr0;
- if (use_result_buffer && accumulate) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
- }
- }
- }
- if (use_result_buffer) {
- c_ptr0 = result_buffer;
- }
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "temploadreg0 .req X0\n"
- "temploadreg1 .req X1\n"
- "temploadreg2 .req X2\n"
- "temploadreg3 .req X3\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v18.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v19.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ins v4.d[1], temploadreg0\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ins v0.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "b.ne 3b\n"
- "2:\n"
- "ins v14.d[1], temploadreg2\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "temploadreg0 .req X2\n"
- "temploadreg1 .req X3\n"
- "temploadreg2 .req X4\n"
- "temploadreg3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v19.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v20.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v21.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v22.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v23.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "ins v15.d[1], temploadreg3\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v0.d[1], temploadreg0\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v1.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "b.ne 3b\n"
- "2:\n"
- "ins v14.d[1], temploadreg2\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr s1, [a_ptr1]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[0], [a_ptr1], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[1], [a_ptr1], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "ld1 {v1.b}[2], [a_ptr1]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "temploadreg0 .req X4\n"
- "temploadreg1 .req X5\n"
- "temploadreg2 .req X6\n"
- "temploadreg3 .req X7\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v20.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v21.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v22.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v23.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v24.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v25.4s, #0\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "movi v26.4s, #0\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "movi v27.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ins v14.d[1], temploadreg2\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v14.d[1], temploadreg2\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ins v0.d[1], temploadreg0\n"
- ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ins v1.d[1], temploadreg1\n"
- ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "ins v15.d[1], temploadreg3\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "ldr d2, [a_ptr2, #-0x10]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg2, [a_ptr2, #-0x8]\n"
- ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ins v2.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "ins v14.d[1], temploadreg2\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr s1, [a_ptr1]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr s2, [a_ptr2]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[0], [a_ptr1], #1\n"
- "ld1 {v2.b}[0], [a_ptr2], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[1], [a_ptr1], #1\n"
- "ld1 {v2.b}[1], [a_ptr2], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "ld1 {v1.b}[2], [a_ptr1]\n"
- "ld1 {v2.b}[2], [a_ptr2]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "temploadreg0 .req X6\n"
- "temploadreg1 .req X7\n"
- "temploadreg2 .req X8\n"
- "temploadreg3 .req X9\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q3, [a_ptr3]\n"
- "movi v20.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v21.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v22.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v23.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v24.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v25.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v26.4s, #0\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "movi v27.4s, #0\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "movi v28.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "movi v29.4s, #0\n"
- "ins v14.d[1], temploadreg2\n"
- "movi v30.4s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "movi v31.4s, #0\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q28, [c_ptr3]\n"
- "ldr q29, [c_ptr3, #0x10]\n"
- "ldr q30, [c_ptr3, #0x20]\n"
- "ldr q31, [c_ptr3, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q3, [a_ptr3]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v14.d[1], temploadreg2\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d7, [a_ptr3]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x8]\n"
- ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v7.d[1], temploadreg3\n"
- ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- ".inst 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- "ins v0.d[1], temploadreg0\n"
- ".inst 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- ".inst 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- "ins v1.d[1], temploadreg1\n"
- ".inst 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr d2, [a_ptr2, #-0x10]\n"
- ".inst 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr temploadreg2, [a_ptr2, #-0x8]\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- "ins v2.d[1], temploadreg2\n"
- ".inst 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr d3, [a_ptr3, #-0x10]\n"
- ".inst 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
- "ldr temploadreg3, [a_ptr3, #-0x8]\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- "ins v3.d[1], temploadreg3\n"
- ".inst 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- "ins v14.d[1], temploadreg2\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr d7, [a_ptr3]\n"
- ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x8]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "ins v6.d[1], temploadreg2\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ins v7.d[1], temploadreg3\n"
- ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- ".inst 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- ".inst 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- ".inst 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- ".inst 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- ".inst 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- ".inst 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- ".inst 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr s1, [a_ptr1]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr s2, [a_ptr2]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr s3, [a_ptr3]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "add a_ptr3, a_ptr3, #0x4\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[0], [a_ptr1], #1\n"
- "ld1 {v2.b}[0], [a_ptr2], #1\n"
- "ld1 {v3.b}[0], [a_ptr3], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[1], [a_ptr1], #1\n"
- "ld1 {v2.b}[1], [a_ptr2], #1\n"
- "ld1 {v3.b}[1], [a_ptr3], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "ld1 {v1.b}[2], [a_ptr1]\n"
- "ld1 {v2.b}[2], [a_ptr2]\n"
- "ld1 {v3.b}[2], [a_ptr3]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- "str q28, [c_ptr3]\n"
- "str q29, [c_ptr3, #0x10]\n"
- "str q30, [c_ptr3, #0x20]\n"
- "str q31, [c_ptr3, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
- );
- break;
- }
- if (use_result_buffer) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
- }
- }
- }
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
deleted file mode 100644
index da39a32690..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_16x4/generic.cpp
+++ /dev/null
@@ -1,1808 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include <cstdint>
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_s8s32_dot_16x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool accumulate) {
- const int K_stride = ((K + 3) / 4) * 4;
- const long loops_count = ((K + 16) / 32) - 1;
- K -= loops_count * 32;
- const long regs_count = (K / 16) - 1;
- K -= (regs_count + 1) * 16;
- const long blocks_count = K / 4;
- const long odds_count = K - (blocks_count * 4);
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const int8_t * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(int8_t);
-
- int32_t *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=16ul) {
- const long width = std::min((unsigned long)N-x0, 16ul);
- long loops = loops_count;
- long regs = regs_count;
- long blocks = blocks_count;
- long odds = odds_count;
- const int8_t *a_ptr0 = a_ptr0_base;
- const int8_t *b_ptr0 = B + (K_stride * x0);
- const bool use_result_buffer = (width < 16);
- int32_t result_buffer[64];
- const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(int32_t);
- int32_t *c_ptr_real = c_ptr0;
- if (use_result_buffer && accumulate) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
- }
- }
- }
- if (use_result_buffer) {
- c_ptr0 = result_buffer;
- }
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v18.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v19.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "cbz %[regs], 4f\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v19.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v20.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v21.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v22.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v23.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "cbz %[regs], 4f\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr s1, [a_ptr1]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[0], [a_ptr1], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[1], [a_ptr1], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "ld1 {v1.b}[2], [a_ptr1]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3: // Three-row variant: accumulates signed 8-bit dot products into v16-v27 and stores three rows of C (c_ptr0, c_ptr1, c_ptr2).
- __asm __volatile ( // Accumulator layout used below: v16-v19 = row 0, v20-v23 = row 1, v24-v27 = row 2.
- "a_ptr1 .req X0\n" // Assembler aliases: x0/x1 hold the A pointers for rows 1 and 2 ...
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n" // ... and x2/x3 hold the C pointers for rows 1 and 2 (all four are in the clobber list).
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n" // Row pointers are derived by adding the lda/ldc operands (bound to ldab/ldcb below) to the previous row's pointer.
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "cbnz %[accumulate], 1f\n" // Non-zero accumulate: jump to 1: and preload the accumulators from C instead of zeroing them.
- "movi v16.4s, #0\n" // Zero v16-v27, interleaved with the first A loads (q0-q2) and B loads (q8-q14).
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v20.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v21.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v22.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v23.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v24.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v25.4s, #0\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "movi v26.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n" // Step each A pointer past the 16 bytes just loaded.
- "movi v27.4s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n" // b_ptr0 now sits 0x80 past the preloaded B data; q15 is fetched later from [b_ptr0, #-0x10].
- "cbz %[loops], 2f\n" // No main-loop iterations requested: go straight to the tail at 2:.
- "b 3f\n"
- "1:\n" // Accumulate path: load v16-v27 from the three C rows, then perform the same initial A/B loads as above.
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n" // Main loop: each pass advances every A pointer by 0x20 and b_ptr0 by 0x200 (two adds of 0x100).
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n" // Instructions are emitted as raw encodings; the intended mnemonic is in the assembler comment inside the string.
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n" // v4-v6 receive the second 16 bytes of each A row for this pass.
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "subs %[loops], %[loops], #0x1\n" // Sets the flags tested by "b.ne 3b" at the bottom of the loop; no instruction in between writes the flags.
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n" // After this advance the following B loads use negative offsets.
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n" // Reload v0-v2 right after their last use in this half; the A pointers were already advanced by 0x20.
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n" // Second half of the pass: same pattern using v4-v6 as the A operands.
- "ldr q2, [a_ptr2, #-0x10]\n"
- ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- "b.ne 3b\n" // Repeat while the subs near the top of the loop left loops non-zero.
- "2:\n" // Tail after the main loop (also the entry point when loops was zero on entry).
- "ldr q15, [%[b_ptr0], #-0x10]\n" // Last B vector of the already-loaded group (q8-q14 are in registers at this point).
- "prfm PSTL1KEEP, [%[c_ptr0]]\n" // Prefetch the three C rows ahead of the stores at 8:.
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "cbz %[regs], 4f\n" // regs == 0: only v0-v2 remain (label 4); otherwise fall through and also load and use v4-v6.
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n" // Final advance for this path: no further B data is preloaded beyond this point.
- ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- "b 5f\n"
- "4:\n" // Single-register tail: only the 16 bytes per A row already held in v0-v2 are consumed.
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n" // Skip the 4-byte block loop when blocks == 0.
- "7:\n" // Block loop: one pass loads 4 bytes from each A row (ldr s0/s1/s2) and 0x40 bytes of B.
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n" // Flags from this subs drive the "b.ne 7b" below; nothing in between writes the flags.
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr s1, [a_ptr1]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr s2, [a_ptr2]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n" // No leftover bytes: go straight to the stores at 8:.
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" // Load 1-3 leftover bytes one at a time into byte lanes 0..2 of v0-v2; the other bytes of each register are left untouched.
- "ld1 {v1.b}[0], [a_ptr1], #1\n" // NOTE(review): v0-v2 are not cleared first, so unused bytes of 4b[0] hold earlier data; presumably the matching B bytes are zero padding - confirm against the B packing code.
- "ld1 {v2.b}[0], [a_ptr2], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[1], [a_ptr1], #1\n"
- "ld1 {v2.b}[1], [a_ptr2], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n" // Third byte is loaded without post-increment; the A pointers are not read again after this.
- "ld1 {v1.b}[2], [a_ptr1]\n"
- "ld1 {v2.b}[2], [a_ptr2]\n"
- "9:\n" // Final partial block: one more 0x40 bytes of B against lane 4b[0] of v0-v2 (b_ptr0 is not advanced here).
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "8:\n" // Write-back: 0x40 bytes (four q registers) per C row.
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n" // c_ptr0 is a read-write operand, so this advance is visible to the C++ code after the asm.
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- ".unreq a_ptr1\n" // Drop the register aliases; the next case's asm statement declares its own aliases again.
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) // Read-write operands: pointers/counters declared as modified by the asm.
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) // Read-only inputs; %[width] is not referenced by this case's asm text.
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory" // Clobbers: all vector registers, x0-x3 (the .req aliases), condition flags and memory.
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q3, [a_ptr3]\n"
- "movi v20.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v21.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v22.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v23.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v24.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v25.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v26.4s, #0\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "movi v27.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "movi v28.4s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "movi v29.4s, #0\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "movi v30.4s, #0\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "movi v31.4s, #0\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q28, [c_ptr3]\n"
- "ldr q29, [c_ptr3, #0x10]\n"
- "ldr q30, [c_ptr3, #0x20]\n"
- "ldr q31, [c_ptr3, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q3, [a_ptr3]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q7, [a_ptr3]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q3, [a_ptr3, #-0x10]\n"
- ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- ".inst 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- ".inst 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- ".inst 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- ".inst 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- ".inst 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- ".inst 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- ".inst 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- ".inst 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- ".inst 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- ".inst 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- ".inst 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- ".inst 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- ".inst 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- ".inst 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- ".inst 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "cbz %[regs], 4f\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr q7, [a_ptr3]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x4f84e110 // sdot v16.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x4f85e114 // sdot v20.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x4f86e118 // sdot v24.4s, v8.16b, v6.4b[0]\n"
- ".inst 0x4f87e11c // sdot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f84e131 // sdot v17.4s, v9.16b, v4.4b[0]\n"
- ".inst 0x4f85e135 // sdot v21.4s, v9.16b, v5.4b[0]\n"
- ".inst 0x4f86e139 // sdot v25.4s, v9.16b, v6.4b[0]\n"
- ".inst 0x4f87e13d // sdot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f84e152 // sdot v18.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x4f85e156 // sdot v22.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x4f86e15a // sdot v26.4s, v10.16b, v6.4b[0]\n"
- ".inst 0x4f87e15e // sdot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f84e173 // sdot v19.4s, v11.16b, v4.4b[0]\n"
- ".inst 0x4f85e177 // sdot v23.4s, v11.16b, v5.4b[0]\n"
- ".inst 0x4f86e17b // sdot v27.4s, v11.16b, v6.4b[0]\n"
- ".inst 0x4f87e17f // sdot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa4e190 // sdot v16.4s, v12.16b, v4.4b[1]\n"
- ".inst 0x4fa5e194 // sdot v20.4s, v12.16b, v5.4b[1]\n"
- ".inst 0x4fa6e198 // sdot v24.4s, v12.16b, v6.4b[1]\n"
- ".inst 0x4fa7e19c // sdot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa4e1b1 // sdot v17.4s, v13.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1b5 // sdot v21.4s, v13.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1b9 // sdot v25.4s, v13.16b, v6.4b[1]\n"
- ".inst 0x4fa7e1bd // sdot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa4e1d2 // sdot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1d6 // sdot v22.4s, v14.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1da // sdot v26.4s, v14.16b, v6.4b[1]\n"
- ".inst 0x4fa7e1de // sdot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa4e1f3 // sdot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x4fa5e1f7 // sdot v23.4s, v15.16b, v5.4b[1]\n"
- ".inst 0x4fa6e1fb // sdot v27.4s, v15.16b, v6.4b[1]\n"
- ".inst 0x4fa7e1ff // sdot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f84e910 // sdot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f85e914 // sdot v20.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x4f86e918 // sdot v24.4s, v8.16b, v6.4b[2]\n"
- ".inst 0x4f87e91c // sdot v28.4s, v8.16b, v7.4b[2]\n"
- ".inst 0x4f84e931 // sdot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x4f85e935 // sdot v21.4s, v9.16b, v5.4b[2]\n"
- ".inst 0x4f86e939 // sdot v25.4s, v9.16b, v6.4b[2]\n"
- ".inst 0x4f87e93d // sdot v29.4s, v9.16b, v7.4b[2]\n"
- ".inst 0x4f84e952 // sdot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x4f85e956 // sdot v22.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x4f86e95a // sdot v26.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x4f87e95e // sdot v30.4s, v10.16b, v7.4b[2]\n"
- ".inst 0x4f84e973 // sdot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x4f85e977 // sdot v23.4s, v11.16b, v5.4b[2]\n"
- ".inst 0x4f86e97b // sdot v27.4s, v11.16b, v6.4b[2]\n"
- ".inst 0x4f87e97f // sdot v31.4s, v11.16b, v7.4b[2]\n"
- ".inst 0x4fa4e990 // sdot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x4fa5e994 // sdot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x4fa6e998 // sdot v24.4s, v12.16b, v6.4b[3]\n"
- ".inst 0x4fa7e99c // sdot v28.4s, v12.16b, v7.4b[3]\n"
- ".inst 0x4fa4e9b1 // sdot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9b5 // sdot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9b9 // sdot v25.4s, v13.16b, v6.4b[3]\n"
- ".inst 0x4fa7e9bd // sdot v29.4s, v13.16b, v7.4b[3]\n"
- ".inst 0x4fa4e9d2 // sdot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9d6 // sdot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9da // sdot v26.4s, v14.16b, v6.4b[3]\n"
- ".inst 0x4fa7e9de // sdot v30.4s, v14.16b, v7.4b[3]\n"
- ".inst 0x4fa4e9f3 // sdot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x4fa5e9f7 // sdot v23.4s, v15.16b, v5.4b[3]\n"
- ".inst 0x4fa6e9fb // sdot v27.4s, v15.16b, v6.4b[3]\n"
- ".inst 0x4fa7e9ff // sdot v31.4s, v15.16b, v7.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4fa0e190 // sdot v16.4s, v12.16b, v0.4b[1]\n"
- ".inst 0x4fa1e194 // sdot v20.4s, v12.16b, v1.4b[1]\n"
- ".inst 0x4fa2e198 // sdot v24.4s, v12.16b, v2.4b[1]\n"
- ".inst 0x4fa3e19c // sdot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x4fa0e1b1 // sdot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1b5 // sdot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1b9 // sdot v25.4s, v13.16b, v2.4b[1]\n"
- ".inst 0x4fa3e1bd // sdot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x4fa0e1d2 // sdot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1d6 // sdot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1da // sdot v26.4s, v14.16b, v2.4b[1]\n"
- ".inst 0x4fa3e1de // sdot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x4fa0e1f3 // sdot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x4fa1e1f7 // sdot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x4fa2e1fb // sdot v27.4s, v15.16b, v2.4b[1]\n"
- ".inst 0x4fa3e1ff // sdot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x4f80e910 // sdot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x4f81e914 // sdot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x4f82e918 // sdot v24.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x4f83e91c // sdot v28.4s, v8.16b, v3.4b[2]\n"
- ".inst 0x4f80e931 // sdot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x4f81e935 // sdot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x4f82e939 // sdot v25.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x4f83e93d // sdot v29.4s, v9.16b, v3.4b[2]\n"
- ".inst 0x4f80e952 // sdot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x4f81e956 // sdot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x4f82e95a // sdot v26.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x4f83e95e // sdot v30.4s, v10.16b, v3.4b[2]\n"
- ".inst 0x4f80e973 // sdot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x4f81e977 // sdot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x4f82e97b // sdot v27.4s, v11.16b, v2.4b[2]\n"
- ".inst 0x4f83e97f // sdot v31.4s, v11.16b, v3.4b[2]\n"
- ".inst 0x4fa0e990 // sdot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x4fa1e994 // sdot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x4fa2e998 // sdot v24.4s, v12.16b, v2.4b[3]\n"
- ".inst 0x4fa3e99c // sdot v28.4s, v12.16b, v3.4b[3]\n"
- ".inst 0x4fa0e9b1 // sdot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9b5 // sdot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9b9 // sdot v25.4s, v13.16b, v2.4b[3]\n"
- ".inst 0x4fa3e9bd // sdot v29.4s, v13.16b, v3.4b[3]\n"
- ".inst 0x4fa0e9d2 // sdot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9d6 // sdot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9da // sdot v26.4s, v14.16b, v2.4b[3]\n"
- ".inst 0x4fa3e9de // sdot v30.4s, v14.16b, v3.4b[3]\n"
- ".inst 0x4fa0e9f3 // sdot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x4fa1e9f7 // sdot v23.4s, v15.16b, v1.4b[3]\n"
- ".inst 0x4fa2e9fb // sdot v27.4s, v15.16b, v2.4b[3]\n"
- ".inst 0x4fa3e9ff // sdot v31.4s, v15.16b, v3.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr s1, [a_ptr1]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr s2, [a_ptr2]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr s3, [a_ptr3]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- "add a_ptr3, a_ptr3, #0x4\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[0], [a_ptr1], #1\n"
- "ld1 {v2.b}[0], [a_ptr2], #1\n"
- "ld1 {v3.b}[0], [a_ptr3], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[1], [a_ptr1], #1\n"
- "ld1 {v2.b}[1], [a_ptr2], #1\n"
- "ld1 {v3.b}[1], [a_ptr3], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "ld1 {v1.b}[2], [a_ptr1]\n"
- "ld1 {v2.b}[2], [a_ptr2]\n"
- "ld1 {v3.b}[2], [a_ptr3]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x4f80e110 // sdot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x4f81e114 // sdot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x4f82e118 // sdot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x4f83e11c // sdot v28.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x4f80e131 // sdot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x4f81e135 // sdot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x4f82e139 // sdot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x4f83e13d // sdot v29.4s, v9.16b, v3.4b[0]\n"
- ".inst 0x4f80e152 // sdot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x4f81e156 // sdot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x4f82e15a // sdot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x4f83e15e // sdot v30.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x4f80e173 // sdot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x4f81e177 // sdot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x4f82e17b // sdot v27.4s, v11.16b, v2.4b[0]\n"
- ".inst 0x4f83e17f // sdot v31.4s, v11.16b, v3.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- "str q28, [c_ptr3]\n"
- "str q29, [c_ptr3, #0x10]\n"
- "str q30, [c_ptr3, #0x20]\n"
- "str q31, [c_ptr3, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- }
- if (use_result_buffer) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
- }
- }
- }
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
new file mode 100644
index 0000000000..16a6f9213a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+
+#define ARGLIST /* Parameter type list shared by the kernel declaration and kern_type; #undef'd at the end of this header. Names in the comments below are those used by the definition in generic.cpp. */ \
+ unsigned int, const unsigned int *, /* num_strings, string_lengths */ \
+ IndirectInputArg<int8_t>, /* A_arg */ \
+ size_t, size_t, /* M, N */ \
+ const int8_t *, /* B_ptr */ \
+ IndirectOutputArg<int32_t>, /* output_arg */ \
+ const int32_t *, Activation, /* both left unnamed by the definition in generic.cpp */ bool /* accumulate */
+
+namespace arm_gemm
+{
+
+// Actual kernel implementation; the definition is in a64_hybrid_s8s32_dot_6x16/generic.cpp
+void a64_hybrid_s8s32_dot_6x16( ARGLIST );
+
+class cls_a64_hybrid_s8s32_dot_6x16 // Descriptor for the a64_hybrid_s8s32_dot_6x16 kernel: element types, blocking parameters and the kernel entry point.
+{
+public:
+ typedef int8_t operand_type; // Input element type; matches the int8_t operands in ARGLIST.
+ typedef int32_t result_type; // Output element type; matches IndirectOutputArg<int32_t> in ARGLIST.
+
+ typedef void (*kern_type)( ARGLIST ); // Pointer to a kernel function taking the ARGLIST parameters.
+
+ /* Kernel blocking parameters. NOTE(review): presumably the output tile is out_height() rows by out_width() columns with K consumed in multiples of k_unroll() - confirm against the GEMM driver that reads them. */
+ static constexpr unsigned int out_height()
+ {
+ return 6;
+ }
+
+ static unsigned int out_width() // Not constexpr, unlike the other accessors in this class. NOTE(review): confirm whether that is intentional.
+ {
+ return 16;
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate() // Always true for this kernel.
+ {
+ return true;
+ }
+
+ StdTransformsFixed<operand_type, result_type, 6, 16, 4> transforms = {}; // The literals 6, 16, 4 repeat the values returned by out_height(), out_width() and k_unroll() above.
+
+ // Kernel entry point; defaults to the generic implementation declared above this class
+ kern_type kernel=a64_hybrid_s8s32_dot_6x16;
+
+ cls_a64_hybrid_s8s32_dot_6x16(const CPUInfo *) // The CPUInfo pointer is unnamed and unused: the body is empty, so 'kernel' keeps its default.
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
new file mode 100644
index 0000000000..3257986410
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_s8s32_dot_6x16/generic.cpp
@@ -0,0 +1,3335 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_s8s32_dot_6x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int32_t> output_arg,
+ const int32_t *, Activation, bool accumulate
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 176f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 141f\n"
+ "beq 106f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 71f\n"
+ "beq 36f\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "tbz %x[flags], #0, 13f\n"
+ "cmp x15, #0x10\n"
+ "bge 12f\n"
+ "tbz x15, #3, 7f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "tbz x15, #2, 5f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "tbz x15, #1, 4f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "tbz x15, #0, 11f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "b 11f\n"
+ "4:" // Height 1: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x15, #0, 11f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "b 11f\n"
+ "5:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x15, #1, 6f\n"
+ "ldr d10, [x13], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x15, #0, 11f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "b 11f\n"
+ "6:" // Height 1: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x15, #0, 11f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "b 11f\n"
+ "7:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x15, #2, 9f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "tbz x15, #1, 8f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "tbz x15, #0, 11f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "b 11f\n"
+ "8:" // Height 1: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x15, #0, 11f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "b 11f\n"
+ "9:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x15, #1, 10f\n"
+ "ldr d8, [x13], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x15, #0, 11f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "b 11f\n"
+ "10:" // Height 1: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "11:" // Height 1: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "b 14f\n"
+ "12:" // Height 1: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "b 14f\n"
+ "13:" // Height 1: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "14:" // Height 1: setup done
+ "mov x12, #0x0\n"
+ "15:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 16f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "cbnz x12, 17f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "b 17f\n"
+ "16:" // Height 1: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "17:" // Height 1: input setup done
+ "cmp x11, #0x10\n"
+ "blt 20f\n"
+ "cmp x11, #0x20\n"
+ "blt 19f\n"
+ "18:" // Height 1: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "bge 18b\n"
+ "19:" // Height 1: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ "20:" // Height 1: Multiply loop: Main loop skip
+ "cbz x11, 25f\n"
+ "cmp x11, #0x4\n"
+ "blt 22f\n"
+ "21:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "cmp x11, #0x4\n"
+ "bge 21b\n"
+ "cbz x11, 25f\n"
+ "22:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 23f\n"
+ "ldr h0, [x10], #0x2\n"
+ "tbz x11, #0, 24f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "b 24f\n"
+ "23:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "24:" // Height 1: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ "25:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 15b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "cmp x15, #0x10\n"
+ "bge 34f\n"
+ "tbz x15, #3, 29f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "tbz x15, #2, 27f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "tbz x15, #1, 26f\n"
+ "str d11, [x13], #0x8\n"
+ "tbz x15, #0, 33f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "b 33f\n"
+ "26:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 33f\n"
+ "str s11, [x13, #0x0]\n"
+ "b 33f\n"
+ "27:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 28f\n"
+ "str d10, [x13], #0x8\n"
+ "tbz x15, #0, 33f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "b 33f\n"
+ "28:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 33f\n"
+ "str s10, [x13, #0x0]\n"
+ "b 33f\n"
+ "29:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 31f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "tbz x15, #1, 30f\n"
+ "str d9, [x13], #0x8\n"
+ "tbz x15, #0, 33f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "b 33f\n"
+ "30:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 33f\n"
+ "str s9, [x13, #0x0]\n"
+ "b 33f\n"
+ "31:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 32f\n"
+ "str d8, [x13], #0x8\n"
+ "tbz x15, #0, 33f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "b 33f\n"
+ "32:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "33:" // Height 1: Partial direct writeback: Done
+ "b 35f\n"
+ "34:" // Height 1: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "35:" // Height 1: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 3b\n"
+ "b 212f\n"
+ "36:" // Height 2
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 37f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "b 38f\n"
+ "37:" // Height 2: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "38:" // Height 2: Column loop
+ "tbz %x[flags], #0, 48f\n"
+ "cmp x15, #0x10\n"
+ "bge 47f\n"
+ "tbz x15, #3, 42f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "tbz x15, #2, 40f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "tbz x15, #1, 39f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "tbz x15, #0, 46f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "b 46f\n"
+ "39:" // Height 2: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x15, #0, 46f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "b 46f\n"
+ "40:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x15, #1, 41f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x15, #0, 46f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "b 46f\n"
+ "41:" // Height 2: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x15, #0, 46f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "b 46f\n"
+ "42:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x15, #2, 44f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "tbz x15, #1, 43f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "tbz x15, #0, 46f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "b 46f\n"
+ "43:" // Height 2: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x15, #0, 46f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "b 46f\n"
+ "44:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x15, #1, 45f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x15, #0, 46f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "b 46f\n"
+ "45:" // Height 2: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "46:" // Height 2: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "b 49f\n"
+ "47:" // Height 2: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "b 49f\n"
+ "48:" // Height 2: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "49:" // Height 2: setup done
+ "mov x12, #0x0\n"
+ "50:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 51f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x12, 52f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "b 52f\n"
+ "51:" // Height 2: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "52:" // Height 2: input setup done
+ "cmp x11, #0x10\n"
+ "blt 55f\n"
+ "cmp x11, #0x20\n"
+ "blt 54f\n"
+ "53:" // Height 2: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "bge 53b\n"
+ "54:" // Height 2: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ "55:" // Height 2: Multiply loop: Main loop skip
+ "cbz x11, 60f\n"
+ "cmp x11, #0x4\n"
+ "blt 57f\n"
+ "56:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "bge 56b\n"
+ "cbz x11, 60f\n"
+ "57:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 58f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "tbz x11, #0, 59f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "b 59f\n"
+ "58:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "59:" // Height 2: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ "60:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 50b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "cmp x15, #0x10\n"
+ "bge 69f\n"
+ "tbz x15, #3, 64f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "tbz x15, #2, 62f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "tbz x15, #1, 61f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "tbz x15, #0, 68f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "b 68f\n"
+ "61:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 68f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "b 68f\n"
+ "62:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 63f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "tbz x15, #0, 68f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "b 68f\n"
+ "63:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 68f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "b 68f\n"
+ "64:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 66f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "tbz x15, #1, 65f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "tbz x15, #0, 68f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "b 68f\n"
+ "65:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 68f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "b 68f\n"
+ "66:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 67f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "tbz x15, #0, 68f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "b 68f\n"
+ "67:" // Height 2: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "68:" // Height 2: Partial direct writeback: Done
+ "b 70f\n"
+ "69:" // Height 2: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "70:" // Height 2: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 38b\n"
+ "b 212f\n"
+ "71:" // Height 3
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 72f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "b 73f\n"
+ "72:" // Height 3: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "73:" // Height 3: Column loop
+ "tbz %x[flags], #0, 83f\n"
+ "cmp x15, #0x10\n"
+ "bge 82f\n"
+ "tbz x15, #3, 77f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "tbz x15, #2, 75f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "tbz x15, #1, 74f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "tbz x15, #0, 81f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "b 81f\n"
+ "74:" // Height 3: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x15, #0, 81f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "b 81f\n"
+ "75:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x15, #1, 76f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x15, #0, 81f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "b 81f\n"
+ "76:" // Height 3: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x15, #0, 81f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "b 81f\n"
+ "77:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x15, #2, 79f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "tbz x15, #1, 78f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "tbz x15, #0, 81f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "b 81f\n"
+ "78:" // Height 3: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x15, #0, 81f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "b 81f\n"
+ "79:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x15, #1, 80f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x15, #0, 81f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "b 81f\n"
+ "80:" // Height 3: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "81:" // Height 3: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "b 84f\n"
+ "82:" // Height 3: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "b 84f\n"
+ "83:" // Height 3: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "84:" // Height 3: setup done
+ "mov x12, #0x0\n"
+ "85:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 86f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x12, 87f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "b 87f\n"
+ "86:" // Height 3: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "87:" // Height 3: input setup done
+ "cmp x11, #0x10\n"
+ "blt 90f\n"
+ "cmp x11, #0x20\n"
+ "blt 89f\n"
+ "88:" // Height 3: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ "bge 88b\n"
+ "89:" // Height 3: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ "90:" // Height 3: Multiply loop: Main loop skip
+ "cbz x11, 95f\n"
+ "cmp x11, #0x4\n"
+ "blt 92f\n"
+ "91:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "bge 91b\n"
+ "cbz x11, 95f\n"
+ "92:" // Height 3: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 93f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "tbz x11, #0, 94f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x26]\n"
+ "b 94f\n"
+ "93:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x26, #0x0]\n"
+ "94:" // Height 3: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ "95:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 85b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "cmp x15, #0x10\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "bge 104f\n"
+ "tbz x15, #3, 99f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "tbz x15, #2, 97f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "tbz x15, #1, 96f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "tbz x15, #0, 103f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "b 103f\n"
+ "96:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 103f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "b 103f\n"
+ "97:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 98f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "tbz x15, #0, 103f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "b 103f\n"
+ "98:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 103f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "b 103f\n"
+ "99:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 101f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "tbz x15, #1, 100f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "tbz x15, #0, 103f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "b 103f\n"
+ "100:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 103f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "b 103f\n"
+ "101:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 102f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "tbz x15, #0, 103f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "b 103f\n"
+ "102:" // Height 3: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "103:" // Height 3: Partial direct writeback: Done
+ "b 105f\n"
+ "104:" // Height 3: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "105:" // Height 3: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 73b\n"
+ "b 212f\n"
+ "106:" // Height 4
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 107f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 108f\n"
+ "107:" // Height 4: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "108:" // Height 4: Column loop
+ "tbz %x[flags], #0, 118f\n"
+ "cmp x15, #0x10\n"
+ "bge 117f\n"
+ "tbz x15, #3, 112f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "ld1 { v21.4s }, [x25], #0x10\n"
+ "tbz x15, #2, 110f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "ld1 { v22.4s }, [x25], #0x10\n"
+ "tbz x15, #1, 109f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "tbz x15, #0, 116f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "ld1 { v23.s }[2], [x25]\n"
+ "b 116f\n"
+ "109:" // Height 4: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x15, #0, 116f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "ldr s23, [x25, #0x0]\n"
+ "b 116f\n"
+ "110:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x15, #1, 111f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x15, #0, 116f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "b 116f\n"
+ "111:" // Height 4: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x15, #0, 116f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "b 116f\n"
+ "112:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x15, #2, 114f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "tbz x15, #1, 113f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "tbz x15, #0, 116f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "ld1 { v21.s }[2], [x25]\n"
+ "b 116f\n"
+ "113:" // Height 4: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x15, #0, 116f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "ldr s21, [x25, #0x0]\n"
+ "b 116f\n"
+ "114:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x15, #1, 115f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x15, #0, 116f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "ld1 { v20.s }[2], [x25]\n"
+ "b 116f\n"
+ "115:" // Height 4: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "ldr s20, [x25, #0x0]\n"
+ "116:" // Height 4: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "b 119f\n"
+ "117:" // Height 4: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "b 119f\n"
+ "118:" // Height 4: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "119:" // Height 4: setup done
+ "mov x12, #0x0\n"
+ "120:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 121f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x12, 122f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "b 122f\n"
+ "121:" // Height 4: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "122:" // Height 4: input setup done
+ "cmp x11, #0x10\n"
+ "blt 125f\n"
+ "cmp x11, #0x20\n"
+ "blt 124f\n"
+ "123:" // Height 4: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ "bge 123b\n"
+ "124:" // Height 4: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ "125:" // Height 4: Multiply loop: Main loop skip
+ "cbz x11, 130f\n"
+ "cmp x11, #0x4\n"
+ "blt 127f\n"
+ "126:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "bge 126b\n"
+ "cbz x11, 130f\n"
+ "127:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 128f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "tbz x11, #0, 129f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "b 129f\n"
+ "128:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x26, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "129:" // Height 4: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ "130:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 120b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "cmp x15, #0x10\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "bge 139f\n"
+ "tbz x15, #3, 134f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "tbz x15, #2, 132f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "st1 { v22.4s }, [x25], #0x10\n"
+ "tbz x15, #1, 131f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "tbz x15, #0, 138f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "st1 { v23.s }[2], [x25]\n"
+ "b 138f\n"
+ "131:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 138f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "str s23, [x25, #0x0]\n"
+ "b 138f\n"
+ "132:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 133f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "tbz x15, #0, 138f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "b 138f\n"
+ "133:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 138f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "b 138f\n"
+ "134:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 136f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "tbz x15, #1, 135f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "tbz x15, #0, 138f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "b 138f\n"
+ "135:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 138f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "b 138f\n"
+ "136:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 137f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "tbz x15, #0, 138f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "b 138f\n"
+ "137:" // Height 4: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "138:" // Height 4: Partial direct writeback: Done
+ "b 140f\n"
+ "139:" // Height 4: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "140:" // Height 4: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 108b\n"
+ "b 212f\n"
+ "141:" // Height 5
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 142f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 143f\n"
+ "142:" // Height 5: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "143:" // Height 5: Column loop
+ "tbz %x[flags], #0, 153f\n"
+ "cmp x15, #0x10\n"
+ "bge 152f\n"
+ "tbz x15, #3, 147f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "ld1 { v21.4s }, [x25], #0x10\n"
+ "ld1 { v25.4s }, [x23], #0x10\n"
+ "tbz x15, #2, 145f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "ld1 { v22.4s }, [x25], #0x10\n"
+ "ld1 { v26.4s }, [x23], #0x10\n"
+ "tbz x15, #1, 144f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "tbz x15, #0, 151f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "ld1 { v23.s }[2], [x25]\n"
+ "ld1 { v27.s }[2], [x23]\n"
+ "b 151f\n"
+ "144:" // Height 5: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x15, #0, 151f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "ldr s23, [x25, #0x0]\n"
+ "ldr s27, [x23, #0x0]\n"
+ "b 151f\n"
+ "145:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x15, #1, 146f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x15, #0, 151f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "ld1 { v26.s }[2], [x23]\n"
+ "b 151f\n"
+ "146:" // Height 5: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x15, #0, 151f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "ldr s26, [x23, #0x0]\n"
+ "b 151f\n"
+ "147:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x15, #2, 149f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "tbz x15, #1, 148f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "tbz x15, #0, 151f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "ld1 { v21.s }[2], [x25]\n"
+ "ld1 { v25.s }[2], [x23]\n"
+ "b 151f\n"
+ "148:" // Height 5: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x15, #0, 151f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "ldr s21, [x25, #0x0]\n"
+ "ldr s25, [x23, #0x0]\n"
+ "b 151f\n"
+ "149:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x15, #1, 150f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x15, #0, 151f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "ld1 { v20.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "b 151f\n"
+ "150:" // Height 5: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "ldr s20, [x25, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "151:" // Height 5: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "sub x23, x23, x19\n"
+ "b 154f\n"
+ "152:" // Height 5: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "ldr q24, [x23, #0x0]\n"
+ "ldr q25, [x23, #0x10]\n"
+ "ldr q26, [x23, #0x20]\n"
+ "ldr q27, [x23, #0x30]\n"
+ "b 154f\n"
+ "153:" // Height 5: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "154:" // Height 5: setup done
+ "mov x12, #0x0\n"
+ "155:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 156f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x12, 157f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "b 157f\n"
+ "156:" // Height 5: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "157:" // Height 5: input setup done
+ "cmp x11, #0x10\n"
+ "blt 160f\n"
+ "cmp x11, #0x20\n"
+ "blt 159f\n"
+ "158:" // Height 5: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ "bge 158b\n"
+ "159:" // Height 5: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ "160:" // Height 5: Multiply loop: Main loop skip
+ "cbz x11, 165f\n"
+ "cmp x11, #0x4\n"
+ "blt 162f\n"
+ "161:" // Height 5: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ "bge 161b\n"
+ "cbz x11, 165f\n"
+ "162:" // Height 5: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 163f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "tbz x11, #0, 164f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "b 164f\n"
+ "163:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x26, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "164:" // Height 5: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ "165:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 155b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "cmp x15, #0x10\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "bge 174f\n"
+ "tbz x15, #3, 169f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v25.4s }, [x23], #0x10\n"
+ "tbz x15, #2, 167f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "st1 { v22.4s }, [x25], #0x10\n"
+ "st1 { v26.4s }, [x23], #0x10\n"
+ "tbz x15, #1, 166f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "str d27, [x23], #0x8\n"
+ "tbz x15, #0, 173f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "st1 { v23.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x23]\n"
+ "b 173f\n"
+ "166:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 173f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "str s23, [x25, #0x0]\n"
+ "str s27, [x23, #0x0]\n"
+ "b 173f\n"
+ "167:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 168f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "str d26, [x23], #0x8\n"
+ "tbz x15, #0, 173f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "st1 { v26.s }[2], [x23]\n"
+ "b 173f\n"
+ "168:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 173f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "str s26, [x23, #0x0]\n"
+ "b 173f\n"
+ "169:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 171f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "tbz x15, #1, 170f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "str d25, [x23], #0x8\n"
+ "tbz x15, #0, 173f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "st1 { v25.s }[2], [x23]\n"
+ "b 173f\n"
+ "170:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 173f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "str s25, [x23, #0x0]\n"
+ "b 173f\n"
+ "171:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 172f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "tbz x15, #0, 173f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "b 173f\n"
+ "172:" // Height 5: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "str s24, [x23, #0x0]\n"
+ "173:" // Height 5: Partial direct writeback: Done
+ "b 175f\n"
+ "174:" // Height 5: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q25, [x23, #0x10]\n"
+ "str q26, [x23, #0x20]\n"
+ "str q27, [x23, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "add x23, x23, #0x40\n"
+ "175:" // Height 5: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 143b\n"
+ "b 212f\n"
+ "176:" // Height 6
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 177f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "ldr x21, [%x[output_ptr], #0x28]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 178f\n"
+ "177:" // Height 6: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "add x21, x23, x19, LSL #2\n"
+ "add %x[output_ptr], x21, x19, LSL #2\n"
+ "178:" // Height 6: Column loop
+ "tbz %x[flags], #0, 188f\n"
+ "cmp x15, #0x10\n"
+ "bge 187f\n"
+ "tbz x15, #3, 182f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "ld1 { v21.4s }, [x25], #0x10\n"
+ "ld1 { v25.4s }, [x23], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
+ "tbz x15, #2, 180f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "ld1 { v22.4s }, [x25], #0x10\n"
+ "ld1 { v26.4s }, [x23], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
+ "tbz x15, #1, 179f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "tbz x15, #0, 186f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "ld1 { v23.s }[2], [x25]\n"
+ "ld1 { v27.s }[2], [x23]\n"
+ "ld1 { v31.s }[2], [x21]\n"
+ "b 186f\n"
+ "179:" // Height 6: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x15, #0, 186f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "ldr s23, [x25, #0x0]\n"
+ "ldr s27, [x23, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
+ "b 186f\n"
+ "180:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x15, #1, 181f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x15, #0, 186f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "ld1 { v26.s }[2], [x23]\n"
+ "ld1 { v30.s }[2], [x21]\n"
+ "b 186f\n"
+ "181:" // Height 6: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x15, #0, 186f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "ldr s26, [x23, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
+ "b 186f\n"
+ "182:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x15, #2, 184f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "tbz x15, #1, 183f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
+ "tbz x15, #0, 186f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "ld1 { v21.s }[2], [x25]\n"
+ "ld1 { v25.s }[2], [x23]\n"
+ "ld1 { v29.s }[2], [x21]\n"
+ "b 186f\n"
+ "183:" // Height 6: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x15, #0, 186f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "ldr s21, [x25, #0x0]\n"
+ "ldr s25, [x23, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
+ "b 186f\n"
+ "184:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x15, #1, 185f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x15, #0, 186f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "ld1 { v20.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v28.s }[2], [x21]\n"
+ "b 186f\n"
+ "185:" // Height 6: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "ldr s20, [x25, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
+ "186:" // Height 6: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "sub x23, x23, x19\n"
+ "sub x21, x21, x19\n"
+ "b 189f\n"
+ "187:" // Height 6: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "ldr q24, [x23, #0x0]\n"
+ "ldr q25, [x23, #0x10]\n"
+ "ldr q26, [x23, #0x20]\n"
+ "ldr q27, [x23, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
+ "b 189f\n"
+ "188:" // Height 6: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "189:" // Height 6: setup done
+ "mov x12, #0x0\n"
+ "190:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 191f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x12, 192f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "add x20, x20, x19\n"
+ "b 192f\n"
+ "191:" // Height 6: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "add x20, x22, x19\n"
+ "192:" // Height 6: input setup done
+ "cmp x11, #0x10\n"
+ "blt 195f\n"
+ "cmp x11, #0x20\n"
+ "blt 194f\n"
+ "193:" // Height 6: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
+ "bge 193b\n"
+ "194:" // Height 6: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x4fa0e0c8 // sdot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0cc // sdot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d0 // sdot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d4 // sdot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0d8 // sdot v24.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0dc // sdot v28.4s, v6.16b, v5.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x4fa0e0e9 // sdot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ed // sdot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f1 // sdot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f5 // sdot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0f9 // sdot v25.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0fd // sdot v29.4s, v7.16b, v5.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x4fa0e0ca // sdot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ce // sdot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0d2 // sdot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0d6 // sdot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0da // sdot v26.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0de // sdot v30.4s, v6.16b, v5.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x4fa0e0eb // sdot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e0ef // sdot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e0f3 // sdot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e0f7 // sdot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e0fb // sdot v27.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e0ff // sdot v31.4s, v7.16b, v5.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x4f80e8c8 // sdot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8cc // sdot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d0 // sdot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d4 // sdot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8d8 // sdot v24.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8dc // sdot v28.4s, v6.16b, v5.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x4f80e8e9 // sdot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ed // sdot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f1 // sdot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f5 // sdot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8f9 // sdot v25.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8fd // sdot v29.4s, v7.16b, v5.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x4f80e8ca // sdot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ce // sdot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8d2 // sdot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8d6 // sdot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8da // sdot v26.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8de // sdot v30.4s, v6.16b, v5.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x4f80e8eb // sdot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x4f81e8ef // sdot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x4f82e8f3 // sdot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x4f83e8f7 // sdot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x4f84e8fb // sdot v27.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x4f85e8ff // sdot v31.4s, v7.16b, v5.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x4fa0e8c8 // sdot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8cc // sdot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d0 // sdot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d4 // sdot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8d8 // sdot v24.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8dc // sdot v28.4s, v6.16b, v5.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x4fa0e8e9 // sdot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ed // sdot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f1 // sdot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f5 // sdot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8f9 // sdot v25.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8fd // sdot v29.4s, v7.16b, v5.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ ".inst 0x4fa0e8ca // sdot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x4fa1e8ce // sdot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8d2 // sdot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8d6 // sdot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8da // sdot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8de // sdot v30.4s, v6.16b, v5.4b[3]\n"
+ ".inst 0x4fa0e8eb // sdot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x4fa1e8ef // sdot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x4fa2e8f3 // sdot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x4fa3e8f7 // sdot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x4fa4e8fb // sdot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x4fa5e8ff // sdot v31.4s, v7.16b, v5.4b[3]\n"
+ "195:" // Height 6: Multiply loop: Main loop skip
+ "cbz x11, 200f\n"
+ "cmp x11, #0x4\n"
+ "blt 197f\n"
+ "196:" // Height 6: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s5, [x20], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ "bge 196b\n"
+ "cbz x11, 200f\n"
+ "197:" // Height 6: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 198f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h5, [x20], #0x2\n"
+ "tbz x11, #0, 199f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 199f\n"
+ "198:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x26, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "ldr b5, [x20, #0x0]\n"
+ "199:" // Height 6: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x4f80e0c8 // sdot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x4f81e0cc // sdot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d0 // sdot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d4 // sdot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0d8 // sdot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0dc // sdot v28.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x4f80e0e9 // sdot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ed // sdot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f1 // sdot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f5 // sdot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0f9 // sdot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0fd // sdot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x4f80e0ca // sdot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x4f81e0ce // sdot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0d2 // sdot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0d6 // sdot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0da // sdot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0de // sdot v30.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x4f80e0eb // sdot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x4f81e0ef // sdot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x4f82e0f3 // sdot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x4f83e0f7 // sdot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x4f84e0fb // sdot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x4f85e0ff // sdot v31.4s, v7.16b, v5.4b[0]\n"
+ "200:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 190b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "cmp x15, #0x10\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "bge 209f\n"
+ "tbz x15, #3, 204f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v25.4s }, [x23], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
+ "tbz x15, #2, 202f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "st1 { v22.4s }, [x25], #0x10\n"
+ "st1 { v26.4s }, [x23], #0x10\n"
+ "st1 { v30.4s }, [x21], #0x10\n"
+ "tbz x15, #1, 201f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "str d27, [x23], #0x8\n"
+ "str d31, [x21], #0x8\n"
+ "tbz x15, #0, 208f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "st1 { v23.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x23]\n"
+ "st1 { v31.s }[2], [x21]\n"
+ "b 208f\n"
+ "201:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 208f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "str s23, [x25, #0x0]\n"
+ "str s27, [x23, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
+ "b 208f\n"
+ "202:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 203f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "str d26, [x23], #0x8\n"
+ "str d30, [x21], #0x8\n"
+ "tbz x15, #0, 208f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "st1 { v26.s }[2], [x23]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "b 208f\n"
+ "203:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 208f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "str s26, [x23, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "b 208f\n"
+ "204:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 206f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "tbz x15, #1, 205f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "str d25, [x23], #0x8\n"
+ "str d29, [x21], #0x8\n"
+ "tbz x15, #0, 208f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "st1 { v25.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "b 208f\n"
+ "205:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 208f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "str s25, [x23, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
+ "b 208f\n"
+ "206:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 207f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x15, #0, 208f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "b 208f\n"
+ "207:" // Height 6: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "str s24, [x23, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
+ "208:" // Height 6: Partial direct writeback: Done
+ "b 210f\n"
+ "209:" // Height 6: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q25, [x23, #0x10]\n"
+ "str q26, [x23, #0x20]\n"
+ "str q27, [x23, #0x30]\n"
+ "str q28, [x21, #0x0]\n"
+ "str q29, [x21, #0x10]\n"
+ "str q30, [x21, #0x20]\n"
+ "str q31, [x21, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "add x23, x23, #0x40\n"
+ "add x21, x21, #0x40\n"
+ "210:" // Height 6: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 178b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 212f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 211f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "211:" // Update direct input
+ "mov x19, #0x6\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "212:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
index e5a88b4519..5b4a7f3e86 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,38 +10,43 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
-
#ifdef __aarch64__
-#include <cstdint>
#include "../std_transforms_fixed.hpp"
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<uint8_t>, \
+ size_t, size_t, \
+ const uint8_t *, \
+ IndirectOutputArg<uint8_t>, \
+ const Requantize32 *, const int32_t *, unsigned int
+
namespace arm_gemm
{
// Actual kernel implementations
-void a64_hybrid_u8u32_dot_16x4(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
-void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+void a64_hybrid_u8qa_dot_4x16( ARGLIST );
-class hybrid_u8u32_dot_16x4
+class cls_a64_hybrid_u8qa_dot_4x16
{
public:
typedef uint8_t operand_type;
- typedef uint32_t result_type;
+ typedef uint8_t result_type;
- typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
static constexpr unsigned int out_height()
@@ -61,32 +66,20 @@ public:
static constexpr bool supports_accumulate()
{
- return true;
- }
-
- static constexpr bool supports_bias()
- {
- return false;
- }
-
- static constexpr bool supports_activation()
- {
return false;
}
StdTransformsFixed<operand_type, result_type, 4, 16, 4> transforms = {};
// Default to the generic kernel
- kern_type kernel=a64_hybrid_u8u32_dot_16x4;
+ kern_type kernel=a64_hybrid_u8qa_dot_4x16;
- hybrid_u8u32_dot_16x4(const CPUInfo *ci)
+ cls_a64_hybrid_u8qa_dot_4x16(const CPUInfo *)
{
- if (ci->get_cpu_model() == CPUModel::A55r1) {
- kernel = a64_hybrid_u8u32_dot_16x4_a55;
- }
}
};
} // namespace arm_gemm
+#undef ARGLIST
#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
new file mode 100644
index 0000000000..ff12472063
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8qa_dot_4x16/generic.cpp
@@ -0,0 +1,2072 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void a64_hybrid_u8qa_dot_4x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+ size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+ struct KernelArgs { // values the asm loads from memory via [args_ptr + offsetof_<field>]
+ unsigned int num_strings = {}; // copy of the num_strings argument; the asm's string loop compares its counter against it
+ const unsigned int *string_lengths = {}; // copy of the string_lengths argument; one 32-bit length is loaded per string
+ size_t N = {}; // copy of the N argument; the column loop consumes it 0x10 at a time
+ const uint8_t *B_ptr = {}; // copy of the B_ptr argument; loaded once into the asm's B operand pointer
+ size_t output_offset = {}; // indirect output: output_arg.indirect.offset; direct output: output_arg.direct.stride
+ size_t input_initial_col = {}; // indirect input only: A_arg.indirect.start_col; stays zero for direct input
+ size_t input_offset = {}; // indirect input: A_arg.indirect.start_row; direct input: A_arg.direct.stride
+ } ka;
+
+ unsigned long flags=0; // mode bits tested by the asm; the asm itself also clears/sets bit 31 (bic/orr #0x80000000) around its row-sum handling
+ void *output_ptr; // assigned on both branches of the output_arg check below
+ void *input_ptr; // assigned on both branches of the A_arg check below
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4; // bit 2: the asm's "tbz flags, #2" falls through to the indirect output setup when this is set
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8; // bit 3: the asm's "tbz flags, #3" falls through to the indirect input setup when this is set
+ } else {
+ assert(num_strings==1); // direct input is only accepted with a single string
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20; // bit 5: when clear, the asm branches past its and/sshr/sqadd sequence to the "no shift correction" label
+ }
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 94f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 63f\n"
+ "beq 32f\n"
+ "movi v11.4s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "movi v12.4s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[col_bias]\n"
+ "movi v13.4s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "add x9, x9, x19\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "4:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "5:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 6f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 7f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "b 7f\n"
+ "6:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "7:" // Height 1: input setup done
+ "cmp x27, #0x10\n"
+ "blt 12f\n"
+ "cmp x27, #0x20\n"
+ "blt 10f\n"
+ "8:" // Height 1: Multiply loop: Main loop head
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q4, [x11, #0x0]\n"
+ ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x10]\n"
+ "ldr q6, [x11, #0x20]\n"
+ ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x30]\n"
+ ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
+ "ldr q8, [x11, #0x40]\n"
+ "ldr q9, [x11, #0x50]\n"
+ ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
+ "ldr q10, [x11, #0x60]\n"
+ "ldr q4, [x11, #0x70]\n"
+ ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
+ "ldr q5, [x11, #0x80]\n"
+ "ldr q6, [x11, #0x90]\n"
+ ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
+ "ldr q7, [x11, #0xa0]\n"
+ ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
+ "ldr q8, [x11, #0xb0]\n"
+ "ldr q9, [x11, #0xc0]\n"
+ ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
+ "ldr q10, [x11, #0xd0]\n"
+ "ldr q4, [x11, #0xe0]\n"
+ ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
+ "ldr q5, [x11, #0xf0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ "tbnz %x[flags], #31, 9f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ "9:" // Height 1: Multiply loop: unique 1: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x20\n"
+ "bge 8b\n"
+ "10:" // Height 1: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q6, [x11, #0x0]\n"
+ ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x10]\n"
+ "ldr q8, [x11, #0x20]\n"
+ ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
+ "ldr q9, [x11, #0x30]\n"
+ ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
+ "ldr q10, [x11, #0x40]\n"
+ "ldr q4, [x11, #0x50]\n"
+ ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x60]\n"
+ "ldr q6, [x11, #0x70]\n"
+ ".inst 0x6fa0e150 // udot v16.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x6fa0e091 // udot v17.4s, v4.16b, v0.4b[1]\n"
+ "ldr q7, [x11, #0x80]\n"
+ "ldr q8, [x11, #0x90]\n"
+ ".inst 0x6fa0e0b2 // udot v18.4s, v5.16b, v0.4b[1]\n"
+ "ldr q9, [x11, #0xa0]\n"
+ ".inst 0x6fa0e0d3 // udot v19.4s, v6.16b, v0.4b[1]\n"
+ "ldr q10, [x11, #0xb0]\n"
+ "ldr q4, [x11, #0xc0]\n"
+ ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f80e911 // udot v17.4s, v8.16b, v0.4b[2]\n"
+ "ldr q5, [x11, #0xd0]\n"
+ "ldr q6, [x11, #0xe0]\n"
+ ".inst 0x6f80e932 // udot v18.4s, v9.16b, v0.4b[2]\n"
+ "ldr q7, [x11, #0xf0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f80e953 // udot v19.4s, v10.16b, v0.4b[2]\n"
+ ".inst 0x6fa0e890 // udot v16.4s, v4.16b, v0.4b[3]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n"
+ "tbnz %x[flags], #31, 11f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ "11:" // Height 1: Multiply loop: unique 2: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "12:" // Height 1: Multiply loop: Main loop skip
+ "cbz x27, 19f\n"
+ "cmp x27, #0x4\n"
+ "blt 15f\n"
+ "13:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x26], #0x4\n"
+ "tbnz %x[flags], #31, 14f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ "14:" // Height 1: Multiply loop: unique 3: skip row sum
+ "ldr q8, [x11, #0x0]\n"
+ ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q9, [x11, #0x10]\n"
+ "ldr q10, [x11, #0x20]\n"
+ ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "ldr q4, [x11, #0x30]\n"
+ ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "sub x27, x27, #0x4\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n"
+ "cmp x27, #0x4\n"
+ "bge 13b\n"
+ "cbz x27, 19f\n"
+ "15:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x27, #1, 16f\n"
+ "ldr h0, [x26], #0x2\n"
+ "tbz x27, #0, 17f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "b 17f\n"
+ "16:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x26, #0x0]\n"
+ "17:" // Height 1: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 18f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ "18:" // Height 1: Multiply loop: unique 4: skip row sum
+ "ldr q5, [x11, #0x0]\n"
+ ".inst 0x6f80e0b0 // udot v16.4s, v5.16b, v0.4b[0]\n"
+ "ldr q6, [x11, #0x10]\n"
+ "ldr q7, [x11, #0x20]\n"
+ ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n"
+ "ldr q8, [x11, #0x30]\n"
+ ".inst 0x6f80e0f2 // udot v18.4s, v7.16b, v0.4b[0]\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x6f80e113 // udot v19.4s, v8.16b, v0.4b[0]\n"
+ "19:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x19\n"
+ "bne 5b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "tbnz %x[flags], #31, 20f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "ld1r { v1.4s }, [x19]\n"
+ "neg v1.4s, v1.4s\n"
+ "mul v11.4s, v11.4s, v1.4s\n"
+ "20:" // Height 1: skip row sum fixup
+ "add v16.4s, v16.4s, v11.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "ldr q0, [x10, #0x0]\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "ldr q1, [x10, #0x10]\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "ldr q2, [x10, #0x20]\n"
+ "ldr q3, [x10, #0x30]\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "add x10, x10, #0x40\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "tbz %x[flags], #5, 21f\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "21:" // Height 1: no shift correction
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x12, #0x10\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "bge 30f\n"
+ "tbz x12, #3, 25f\n"
+ "str d16, [x9], #0x8\n"
+ "tbz x12, #2, 23f\n"
+ "st1 { v16.s }[2], [x9], #0x4\n"
+ "tbz x12, #1, 22f\n"
+ "st1 { v16.h }[6], [x9], #0x2\n"
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[14], [x9]\n"
+ "b 29f\n"
+ "22:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[12], [x9]\n"
+ "b 29f\n"
+ "23:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x12, #1, 24f\n"
+ "st1 { v16.h }[4], [x9], #0x2\n"
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[10], [x9]\n"
+ "b 29f\n"
+ "24:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[8], [x9]\n"
+ "b 29f\n"
+ "25:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x12, #2, 27f\n"
+ "str s16, [x9], #0x4\n"
+ "tbz x12, #1, 26f\n"
+ "st1 { v16.h }[2], [x9], #0x2\n"
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[6], [x9]\n"
+ "b 29f\n"
+ "26:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[4], [x9]\n"
+ "b 29f\n"
+ "27:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x12, #1, 28f\n"
+ "str h16, [x9], #0x2\n"
+ "tbz x12, #0, 29f\n"
+ "st1 { v16.b }[2], [x9]\n"
+ "b 29f\n"
+ "28:" // Height 1: Partial direct writeback: partial_1_0
+ "str b16, [x9, #0x0]\n"
+ "29:" // Height 1: Partial direct writeback: Done
+ "b 31f\n"
+ "30:" // Height 1: Full writeback
+ "str q16, [x9, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "31:" // Height 1: Writeback done
+ "subs x12, x12, #0x10\n"
+ "bgt 3b\n"
+ "b 126f\n"
+ "32:" // Height 2
+ "movi v11.4s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x10, %x[col_bias]\n"
+ "movi v12.4s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v13.4s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "tbz %x[flags], #2, 33f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "ldr x25, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "add x25, x25, x19\n"
+ "b 34f\n"
+ "33:" // Height 2: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "add x25, x9, x19\n"
+ "34:" // Height 2: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "35:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "36:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 37f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x28, 38f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "b 38f\n"
+ "37:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x24, x26, x19\n"
+ "38:" // Height 2: input setup done
+ "cmp x27, #0x10\n"
+ "blt 43f\n"
+ "cmp x27, #0x20\n"
+ "blt 41f\n"
+ "39:" // Height 2: Multiply loop: Main loop head
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q4, [x11, #0x0]\n"
+ ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x10]\n"
+ ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
+ "ldr q6, [x11, #0x20]\n"
+ "ldr q7, [x11, #0x30]\n"
+ ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q8, [x11, #0x40]\n"
+ ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
+ "ldr q9, [x11, #0x50]\n"
+ ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
+ "ldr q10, [x11, #0x60]\n"
+ ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
+ "ldr q4, [x11, #0x70]\n"
+ ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x80]\n"
+ ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
+ "ldr q6, [x11, #0x90]\n"
+ ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
+ "ldr q7, [x11, #0xa0]\n"
+ ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
+ "ldr q8, [x11, #0xb0]\n"
+ ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
+ "ldr q9, [x11, #0xc0]\n"
+ ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
+ "ldr q10, [x11, #0xd0]\n"
+ ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
+ "ldr q4, [x11, #0xe0]\n"
+ ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
+ "ldr q5, [x11, #0xf0]\n"
+ ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
+ "tbnz %x[flags], #31, 40f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ "40:" // Height 2: Multiply loop: unique 5: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x27, #0x20\n"
+ "bge 39b\n"
+ "41:" // Height 2: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q6, [x11, #0x0]\n"
+ ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x10]\n"
+ ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n"
+ "ldr q8, [x11, #0x20]\n"
+ "ldr q9, [x11, #0x30]\n"
+ ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
+ "ldr q10, [x11, #0x40]\n"
+ ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n"
+ "ldr q4, [x11, #0x50]\n"
+ ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x60]\n"
+ ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
+ "ldr q6, [x11, #0x70]\n"
+ ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x80]\n"
+ ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n"
+ "ldr q8, [x11, #0x90]\n"
+ ".inst 0x6fa0e150 // udot v16.4s, v10.16b, v0.4b[1]\n"
+ "ldr q9, [x11, #0xa0]\n"
+ ".inst 0x6fa1e154 // udot v20.4s, v10.16b, v1.4b[1]\n"
+ "ldr q10, [x11, #0xb0]\n"
+ ".inst 0x6fa0e091 // udot v17.4s, v4.16b, v0.4b[1]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6fa1e095 // udot v21.4s, v4.16b, v1.4b[1]\n"
+ "ldr q4, [x11, #0xc0]\n"
+ ".inst 0x6fa0e0b2 // udot v18.4s, v5.16b, v0.4b[1]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6fa1e0b6 // udot v22.4s, v5.16b, v1.4b[1]\n"
+ "ldr q5, [x11, #0xd0]\n"
+ ".inst 0x6fa0e0d3 // udot v19.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0d7 // udot v23.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x11, #0xe0]\n"
+ ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8f4 // udot v20.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x11, #0xf0]\n"
+ ".inst 0x6f80e911 // udot v17.4s, v8.16b, v0.4b[2]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x6f81e915 // udot v21.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x6f80e932 // udot v18.4s, v9.16b, v0.4b[2]\n"
+ ".inst 0x6f81e936 // udot v22.4s, v9.16b, v1.4b[2]\n"
+ ".inst 0x6f80e953 // udot v19.4s, v10.16b, v0.4b[2]\n"
+ ".inst 0x6f81e957 // udot v23.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x6fa0e890 // udot v16.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e894 // udot v20.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8b5 // udot v21.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8d6 // udot v22.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8f7 // udot v23.4s, v7.16b, v1.4b[3]\n"
+ "tbnz %x[flags], #31, 42f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ "42:" // Height 2: Multiply loop: unique 6: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "43:" // Height 2: Multiply loop: Main loop skip
+ "cbz x27, 50f\n"
+ "cmp x27, #0x4\n"
+ "blt 46f\n"
+ "44:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x26], #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "tbnz %x[flags], #31, 45f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ "45:" // Height 2: Multiply loop: unique 7: skip row sum
+ "ldr q8, [x11, #0x0]\n"
+ ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q9, [x11, #0x10]\n"
+ ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q10, [x11, #0x20]\n"
+ "ldr q4, [x11, #0x30]\n"
+ ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "sub x27, x27, #0x4\n"
+ ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "cmp x27, #0x4\n"
+ ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n"
+ "bge 44b\n"
+ "cbz x27, 50f\n"
+ "46:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x27, #1, 47f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "tbz x27, #0, 48f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "b 48f\n"
+ "47:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "48:" // Height 2: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 49f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ "49:" // Height 2: Multiply loop: unique 8: skip row sum
+ "ldr q5, [x11, #0x0]\n"
+ ".inst 0x6f80e0b0 // udot v16.4s, v5.16b, v0.4b[0]\n"
+ "ldr q6, [x11, #0x10]\n"
+ ".inst 0x6f81e0b4 // udot v20.4s, v5.16b, v1.4b[0]\n"
+ "ldr q7, [x11, #0x20]\n"
+ "ldr q8, [x11, #0x30]\n"
+ ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f80e0f2 // udot v18.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0f6 // udot v22.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f80e113 // udot v19.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x6f81e117 // udot v23.4s, v8.16b, v1.4b[0]\n"
+ "50:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x19\n"
+ "bne 36b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbnz %x[flags], #31, 51f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1r { v2.4s }, [x19]\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "neg v2.4s, v2.4s\n"
+ "mul v11.4s, v11.4s, v2.4s\n"
+ "mul v12.4s, v12.4s, v2.4s\n"
+ "51:" // Height 2: skip row sum fixup
+ "add v16.4s, v16.4s, v11.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "ldr q0, [x10, #0x0]\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "ldr q1, [x10, #0x10]\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "ldr q2, [x10, #0x20]\n"
+ "add v20.4s, v20.4s, v12.4s\n"
+ "ldr q3, [x10, #0x30]\n"
+ "add v21.4s, v21.4s, v12.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add v22.4s, v22.4s, v12.4s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "add x10, x10, #0x40\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "tbz %x[flags], #5, 52f\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "and v8.16b, v20.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v9.16b, v21.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v10.16b, v22.16b, v0.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "52:" // Height 2: no shift correction
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x12, #0x10\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "bge 61f\n"
+ "tbz x12, #3, 56f\n"
+ "str d16, [x9], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "tbz x12, #2, 54f\n"
+ "st1 { v16.s }[2], [x9], #0x4\n"
+ "st1 { v20.s }[2], [x25], #0x4\n"
+ "tbz x12, #1, 53f\n"
+ "st1 { v16.h }[6], [x9], #0x2\n"
+ "st1 { v20.h }[6], [x25], #0x2\n"
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[14], [x9]\n"
+ "st1 { v20.b }[14], [x25]\n"
+ "b 60f\n"
+ "53:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[12], [x9]\n"
+ "st1 { v20.b }[12], [x25]\n"
+ "b 60f\n"
+ "54:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x12, #1, 55f\n"
+ "st1 { v16.h }[4], [x9], #0x2\n"
+ "st1 { v20.h }[4], [x25], #0x2\n"
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[10], [x9]\n"
+ "st1 { v20.b }[10], [x25]\n"
+ "b 60f\n"
+ "55:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[8], [x9]\n"
+ "st1 { v20.b }[8], [x25]\n"
+ "b 60f\n"
+ "56:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x12, #2, 58f\n"
+ "str s16, [x9], #0x4\n"
+ "str s20, [x25], #0x4\n"
+ "tbz x12, #1, 57f\n"
+ "st1 { v16.h }[2], [x9], #0x2\n"
+ "st1 { v20.h }[2], [x25], #0x2\n"
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[6], [x9]\n"
+ "st1 { v20.b }[6], [x25]\n"
+ "b 60f\n"
+ "57:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[4], [x9]\n"
+ "st1 { v20.b }[4], [x25]\n"
+ "b 60f\n"
+ "58:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x12, #1, 59f\n"
+ "str h16, [x9], #0x2\n"
+ "str h20, [x25], #0x2\n"
+ "tbz x12, #0, 60f\n"
+ "st1 { v16.b }[2], [x9]\n"
+ "st1 { v20.b }[2], [x25]\n"
+ "b 60f\n"
+ "59:" // Height 2: Partial direct writeback: partial_1_0
+ "str b16, [x9, #0x0]\n"
+ "str b20, [x25, #0x0]\n"
+ "60:" // Height 2: Partial direct writeback: Done
+ "b 62f\n"
+ "61:" // Height 2: Full writeback
+ "str q16, [x9, #0x0]\n"
+ "str q20, [x25, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "62:" // Height 2: Writeback done
+ "subs x12, x12, #0x10\n"
+ "bgt 34b\n"
+ "b 126f\n"
+ "63:" // Height 3
+ "movi v11.4s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x10, %x[col_bias]\n"
+ "movi v12.4s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v13.4s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "tbz %x[flags], #2, 64f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "ldr x25, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "ldr x23, [%x[output_ptr], #0x10]\n"
+ "add x25, x25, x19\n"
+ "add x23, x23, x19\n"
+ "b 65f\n"
+ "64:" // Height 3: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "add x25, x9, x19\n"
+ "add x23, x25, x19\n"
+ "65:" // Height 3: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "66:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "67:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 68f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "cbnz x28, 69f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "b 69f\n"
+ "68:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "69:" // Height 3: input setup done
+ "cmp x27, #0x10\n"
+ "blt 74f\n"
+ "cmp x27, #0x20\n"
+ "blt 72f\n"
+ "70:" // Height 3: Multiply loop: Main loop head
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q4, [x11, #0x0]\n"
+ ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x10]\n"
+ ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
+ "ldr q6, [x11, #0x20]\n"
+ ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
+ "ldr q7, [x11, #0x30]\n"
+ "ldr q8, [x11, #0x40]\n"
+ ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q9, [x11, #0x50]\n"
+ ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
+ "ldr q10, [x11, #0x60]\n"
+ ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
+ "ldr q4, [x11, #0x70]\n"
+ ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x80]\n"
+ ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x11, #0x90]\n"
+ ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x11, #0xa0]\n"
+ ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
+ "ldr q8, [x11, #0xb0]\n"
+ ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
+ "ldr q9, [x11, #0xc0]\n"
+ ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
+ "ldr q10, [x11, #0xd0]\n"
+ ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
+ "ldr q4, [x11, #0xe0]\n"
+ ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
+ "ldr q5, [x11, #0xf0]\n"
+ ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n"
+ ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n"
+ "tbnz %x[flags], #31, 71f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ "71:" // Height 3: Multiply loop: unique 9: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x27, #0x20\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "bge 70b\n"
+ "72:" // Height 3: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q6, [x11, #0x0]\n"
+ ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x10]\n"
+ ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n"
+ "ldr q8, [x11, #0x20]\n"
+ ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n"
+ "ldr q9, [x11, #0x30]\n"
+ "ldr q10, [x11, #0x40]\n"
+ ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
+ "ldr q4, [x11, #0x50]\n"
+ ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n"
+ "ldr q5, [x11, #0x60]\n"
+ ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n"
+ "ldr q6, [x11, #0x70]\n"
+ ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x80]\n"
+ ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n"
+ "ldr q8, [x11, #0x90]\n"
+ ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n"
+ "ldr q9, [x11, #0xa0]\n"
+ ".inst 0x6fa0e150 // udot v16.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e154 // udot v20.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e158 // udot v24.4s, v10.16b, v2.4b[1]\n"
+ "ldr q10, [x11, #0xb0]\n"
+ ".inst 0x6fa0e091 // udot v17.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e095 // udot v21.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e099 // udot v25.4s, v4.16b, v2.4b[1]\n"
+ "ldr q4, [x11, #0xc0]\n"
+ ".inst 0x6fa0e0b2 // udot v18.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0b6 // udot v22.4s, v5.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0ba // udot v26.4s, v5.16b, v2.4b[1]\n"
+ "ldr q5, [x11, #0xd0]\n"
+ ".inst 0x6fa0e0d3 // udot v19.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0d7 // udot v23.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0db // udot v27.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x11, #0xe0]\n"
+ ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8f4 // udot v20.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f8 // udot v24.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x11, #0xf0]\n"
+ ".inst 0x6f80e911 // udot v17.4s, v8.16b, v0.4b[2]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x6f81e915 // udot v21.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x6f82e919 // udot v25.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x6f80e932 // udot v18.4s, v9.16b, v0.4b[2]\n"
+ ".inst 0x6f81e936 // udot v22.4s, v9.16b, v1.4b[2]\n"
+ ".inst 0x6f82e93a // udot v26.4s, v9.16b, v2.4b[2]\n"
+ ".inst 0x6f80e953 // udot v19.4s, v10.16b, v0.4b[2]\n"
+ ".inst 0x6f81e957 // udot v23.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x6f82e95b // udot v27.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x6fa0e890 // udot v16.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e894 // udot v20.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e898 // udot v24.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8b5 // udot v21.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8b9 // udot v25.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8d6 // udot v22.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8da // udot v26.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8f7 // udot v23.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n"
+ "tbnz %x[flags], #31, 73f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ "73:" // Height 3: Multiply loop: unique 10: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "74:" // Height 3: Multiply loop: Main loop skip
+ "cbz x27, 81f\n"
+ "cmp x27, #0x4\n"
+ "blt 77f\n"
+ "75:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x26], #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "ldr s2, [x22], #0x4\n"
+ "tbnz %x[flags], #31, 76f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ "76:" // Height 3: Multiply loop: unique 11: skip row sum
+ "ldr q8, [x11, #0x0]\n"
+ ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q9, [x11, #0x10]\n"
+ ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q10, [x11, #0x20]\n"
+ ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q4, [x11, #0x30]\n"
+ "sub x27, x27, #0x4\n"
+ ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "cmp x27, #0x4\n"
+ ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n"
+ "bge 75b\n"
+ "cbz x27, 81f\n"
+ "77:" // Height 3: Multiply loop: Skip odd blocks
+ "tbz x27, #1, 78f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x22], #0x2\n"
+ "tbz x27, #0, 79f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x22]\n"
+ "b 79f\n"
+ "78:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x22, #0x0]\n"
+ "79:" // Height 3: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 80f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ "80:" // Height 3: Multiply loop: unique 12: skip row sum
+ "ldr q5, [x11, #0x0]\n"
+ ".inst 0x6f80e0b0 // udot v16.4s, v5.16b, v0.4b[0]\n"
+ "ldr q6, [x11, #0x10]\n"
+ ".inst 0x6f81e0b4 // udot v20.4s, v5.16b, v1.4b[0]\n"
+ "ldr q7, [x11, #0x20]\n"
+ ".inst 0x6f82e0b8 // udot v24.4s, v5.16b, v2.4b[0]\n"
+ "ldr q8, [x11, #0x30]\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f80e0f2 // udot v18.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0f6 // udot v22.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0fa // udot v26.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f80e113 // udot v19.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x6f81e117 // udot v23.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x6f82e11b // udot v27.4s, v8.16b, v2.4b[0]\n"
+ "81:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x19\n"
+ "bne 67b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbnz %x[flags], #31, 82f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1r { v3.4s }, [x19]\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "neg v3.4s, v3.4s\n"
+ "mul v11.4s, v11.4s, v3.4s\n"
+ "mul v12.4s, v12.4s, v3.4s\n"
+ "mul v13.4s, v13.4s, v3.4s\n"
+ "82:" // Height 3: skip row sum fixup
+ "add v16.4s, v16.4s, v11.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "ldr q0, [x10, #0x0]\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "ldr q1, [x10, #0x10]\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "ldr q2, [x10, #0x20]\n"
+ "add v20.4s, v20.4s, v12.4s\n"
+ "ldr q3, [x10, #0x30]\n"
+ "add v21.4s, v21.4s, v12.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add v22.4s, v22.4s, v12.4s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "add x10, x10, #0x40\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v25.4s, v25.4s, v1.4s\n"
+ "add v26.4s, v26.4s, v2.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v4.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v4.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "tbz %x[flags], #5, 83f\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "and v8.16b, v20.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v9.16b, v21.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v10.16b, v22.16b, v0.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v24.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "and v6.16b, v25.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v5.4s\n"
+ "and v7.16b, v26.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v8.16b, v27.16b, v0.16b\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v7.4s\n"
+ "sqadd v27.4s, v27.4s, v8.4s\n"
+ "83:" // Height 3: no shift correction
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x12, #0x10\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "srshl v25.4s, v25.4s, v0.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "srshl v26.4s, v26.4s, v0.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "srshl v27.4s, v27.4s, v0.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "smin v26.4s, v26.4s, v6.4s\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "smax v26.4s, v26.4s, v5.4s\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "bge 92f\n"
+ "tbz x12, #3, 87f\n"
+ "str d16, [x9], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "tbz x12, #2, 85f\n"
+ "st1 { v16.s }[2], [x9], #0x4\n"
+ "st1 { v20.s }[2], [x25], #0x4\n"
+ "st1 { v24.s }[2], [x23], #0x4\n"
+ "tbz x12, #1, 84f\n"
+ "st1 { v16.h }[6], [x9], #0x2\n"
+ "st1 { v20.h }[6], [x25], #0x2\n"
+ "st1 { v24.h }[6], [x23], #0x2\n"
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[14], [x9]\n"
+ "st1 { v20.b }[14], [x25]\n"
+ "st1 { v24.b }[14], [x23]\n"
+ "b 91f\n"
+ "84:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[12], [x9]\n"
+ "st1 { v20.b }[12], [x25]\n"
+ "st1 { v24.b }[12], [x23]\n"
+ "b 91f\n"
+ "85:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x12, #1, 86f\n"
+ "st1 { v16.h }[4], [x9], #0x2\n"
+ "st1 { v20.h }[4], [x25], #0x2\n"
+ "st1 { v24.h }[4], [x23], #0x2\n"
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[10], [x9]\n"
+ "st1 { v20.b }[10], [x25]\n"
+ "st1 { v24.b }[10], [x23]\n"
+ "b 91f\n"
+ "86:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[8], [x9]\n"
+ "st1 { v20.b }[8], [x25]\n"
+ "st1 { v24.b }[8], [x23]\n"
+ "b 91f\n"
+ "87:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x12, #2, 89f\n"
+ "str s16, [x9], #0x4\n"
+ "str s20, [x25], #0x4\n"
+ "str s24, [x23], #0x4\n"
+ "tbz x12, #1, 88f\n"
+ "st1 { v16.h }[2], [x9], #0x2\n"
+ "st1 { v20.h }[2], [x25], #0x2\n"
+ "st1 { v24.h }[2], [x23], #0x2\n"
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[6], [x9]\n"
+ "st1 { v20.b }[6], [x25]\n"
+ "st1 { v24.b }[6], [x23]\n"
+ "b 91f\n"
+ "88:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[4], [x9]\n"
+ "st1 { v20.b }[4], [x25]\n"
+ "st1 { v24.b }[4], [x23]\n"
+ "b 91f\n"
+ "89:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x12, #1, 90f\n"
+ "str h16, [x9], #0x2\n"
+ "str h20, [x25], #0x2\n"
+ "str h24, [x23], #0x2\n"
+ "tbz x12, #0, 91f\n"
+ "st1 { v16.b }[2], [x9]\n"
+ "st1 { v20.b }[2], [x25]\n"
+ "st1 { v24.b }[2], [x23]\n"
+ "b 91f\n"
+ "90:" // Height 3: Partial direct writeback: partial_1_0
+ "str b16, [x9, #0x0]\n"
+ "str b20, [x25, #0x0]\n"
+ "str b24, [x23, #0x0]\n"
+ "91:" // Height 3: Partial direct writeback: Done
+ "b 93f\n"
+ "92:" // Height 3: Full writeback
+ "str q16, [x9, #0x0]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q24, [x23, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "93:" // Height 3: Writeback done
+ "subs x12, x12, #0x10\n"
+ "bgt 65b\n"
+ "b 126f\n"
+ "94:" // Height 4
+ "movi v11.4s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x10, %x[col_bias]\n"
+ "movi v12.4s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "movi v13.4s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.16b, #0x1\n"
+ "tbz %x[flags], #2, 95f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "ldr x25, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "ldr x23, [%x[output_ptr], #0x10]\n"
+ "ldr x21, [%x[output_ptr], #0x18]\n"
+ "add x25, x25, x19\n"
+ "add %x[output_ptr], %x[output_ptr], #0x20\n"
+ "add x23, x23, x19\n"
+ "add x21, x21, x19\n"
+ "b 96f\n"
+ "95:" // Height 4: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "add x25, x9, x19\n"
+ "add x23, x25, x19\n"
+ "add x21, x23, x19\n"
+ "add %x[output_ptr], x21, x19\n"
+ "96:" // Height 4: Column loop
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "97:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "98:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 99f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x20, [x20, #0x18]\n"
+ "cbnz x28, 100f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "add x20, x20, x19\n"
+ "b 100f\n"
+ "99:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "add x20, x22, x19\n"
+ "100:" // Height 4: input setup done
+ "cmp x27, #0x10\n"
+ "blt 105f\n"
+ "cmp x27, #0x20\n"
+ "blt 103f\n"
+ "101:" // Height 4: Multiply loop: Main loop head
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "ldr q4, [x11, #0x0]\n"
+ ".inst 0x6f80e090 // udot v16.4s, v4.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x10]\n"
+ ".inst 0x6f81e094 // udot v20.4s, v4.16b, v1.4b[0]\n"
+ "ldr q6, [x11, #0x20]\n"
+ ".inst 0x6f82e098 // udot v24.4s, v4.16b, v2.4b[0]\n"
+ "ldr q7, [x11, #0x30]\n"
+ ".inst 0x6f83e09c // udot v28.4s, v4.16b, v3.4b[0]\n"
+ "ldr q8, [x11, #0x40]\n"
+ "ldr q9, [x11, #0x50]\n"
+ ".inst 0x6f80e0b1 // udot v17.4s, v5.16b, v0.4b[0]\n"
+ "ldr q10, [x11, #0x60]\n"
+ ".inst 0x6f81e0b5 // udot v21.4s, v5.16b, v1.4b[0]\n"
+ "ldr q4, [x11, #0x70]\n"
+ ".inst 0x6f82e0b9 // udot v25.4s, v5.16b, v2.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f83e0bd // udot v29.4s, v5.16b, v3.4b[0]\n"
+ "ldr q5, [x11, #0x80]\n"
+ ".inst 0x6f80e0d2 // udot v18.4s, v6.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6f81e0d6 // udot v22.4s, v6.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6f82e0da // udot v26.4s, v6.16b, v2.4b[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x6f83e0de // udot v30.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x11, #0x90]\n"
+ ".inst 0x6f80e0f3 // udot v19.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0f7 // udot v23.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0fb // udot v27.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0ff // udot v31.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x11, #0xa0]\n"
+ ".inst 0x6fa0e110 // udot v16.4s, v8.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e114 // udot v20.4s, v8.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e118 // udot v24.4s, v8.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e11c // udot v28.4s, v8.16b, v3.4b[1]\n"
+ "ldr q8, [x11, #0xb0]\n"
+ ".inst 0x6fa0e131 // udot v17.4s, v9.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e135 // udot v21.4s, v9.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e139 // udot v25.4s, v9.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e13d // udot v29.4s, v9.16b, v3.4b[1]\n"
+ "ldr q9, [x11, #0xc0]\n"
+ ".inst 0x6fa0e152 // udot v18.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e156 // udot v22.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e15a // udot v26.4s, v10.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e15e // udot v30.4s, v10.16b, v3.4b[1]\n"
+ "ldr q10, [x11, #0xd0]\n"
+ ".inst 0x6fa0e093 // udot v19.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e097 // udot v23.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e09b // udot v27.4s, v4.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e09f // udot v31.4s, v4.16b, v3.4b[1]\n"
+ "ldr q4, [x11, #0xe0]\n"
+ ".inst 0x6f80e8b0 // udot v16.4s, v5.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8b4 // udot v20.4s, v5.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8b8 // udot v24.4s, v5.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8bc // udot v28.4s, v5.16b, v3.4b[2]\n"
+ "ldr q5, [x11, #0xf0]\n"
+ ".inst 0x6f80e8d1 // udot v17.4s, v6.16b, v0.4b[2]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x6f81e8d5 // udot v21.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d9 // udot v25.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8dd // udot v29.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f80e8f2 // udot v18.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8f6 // udot v22.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8fa // udot v26.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8fe // udot v30.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f80e913 // udot v19.4s, v8.16b, v0.4b[2]\n"
+ ".inst 0x6f81e917 // udot v23.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x6f82e91b // udot v27.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x6f83e91f // udot v31.4s, v8.16b, v3.4b[2]\n"
+ ".inst 0x6fa0e930 // udot v16.4s, v9.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e934 // udot v20.4s, v9.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e938 // udot v24.4s, v9.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e93c // udot v28.4s, v9.16b, v3.4b[3]\n"
+ ".inst 0x6fa0e951 // udot v17.4s, v10.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e955 // udot v21.4s, v10.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e959 // udot v25.4s, v10.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e95d // udot v29.4s, v10.16b, v3.4b[3]\n"
+ ".inst 0x6fa0e892 // udot v18.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e896 // udot v22.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e89a // udot v26.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e89e // udot v30.4s, v4.16b, v3.4b[3]\n"
+ ".inst 0x6fa0e8b3 // udot v19.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8b7 // udot v23.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8bb // udot v27.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8bf // udot v31.4s, v5.16b, v3.4b[3]\n"
+ "tbnz %x[flags], #31, 102f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
+ "102:" // Height 4: Multiply loop: unique 13: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x27, #0x20\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "bge 101b\n"
+ "103:" // Height 4: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q0, [x26, #0x0]\n"
+ "ldr q1, [x24, #0x0]\n"
+ "ldr q2, [x22, #0x0]\n"
+ "ldr q3, [x20, #0x0]\n"
+ "ldr q6, [x11, #0x0]\n"
+ ".inst 0x6f80e0d0 // udot v16.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x11, #0x10]\n"
+ ".inst 0x6f81e0d4 // udot v20.4s, v6.16b, v1.4b[0]\n"
+ "ldr q8, [x11, #0x20]\n"
+ ".inst 0x6f82e0d8 // udot v24.4s, v6.16b, v2.4b[0]\n"
+ "ldr q9, [x11, #0x30]\n"
+ ".inst 0x6f83e0dc // udot v28.4s, v6.16b, v3.4b[0]\n"
+ "ldr q10, [x11, #0x40]\n"
+ "ldr q4, [x11, #0x50]\n"
+ ".inst 0x6f80e0f1 // udot v17.4s, v7.16b, v0.4b[0]\n"
+ "ldr q5, [x11, #0x60]\n"
+ ".inst 0x6f81e0f5 // udot v21.4s, v7.16b, v1.4b[0]\n"
+ "ldr q6, [x11, #0x70]\n"
+ ".inst 0x6f82e0f9 // udot v25.4s, v7.16b, v2.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f83e0fd // udot v29.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x11, #0x80]\n"
+ ".inst 0x6f80e112 // udot v18.4s, v8.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6f81e116 // udot v22.4s, v8.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6f82e11a // udot v26.4s, v8.16b, v2.4b[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x6f83e11e // udot v30.4s, v8.16b, v3.4b[0]\n"
+ "ldr q8, [x11, #0x90]\n"
+ ".inst 0x6f80e133 // udot v19.4s, v9.16b, v0.4b[0]\n"
+ ".inst 0x6f81e137 // udot v23.4s, v9.16b, v1.4b[0]\n"
+ ".inst 0x6f82e13b // udot v27.4s, v9.16b, v2.4b[0]\n"
+ ".inst 0x6f83e13f // udot v31.4s, v9.16b, v3.4b[0]\n"
+ "ldr q9, [x11, #0xa0]\n"
+ ".inst 0x6fa0e150 // udot v16.4s, v10.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e154 // udot v20.4s, v10.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e158 // udot v24.4s, v10.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e15c // udot v28.4s, v10.16b, v3.4b[1]\n"
+ "ldr q10, [x11, #0xb0]\n"
+ ".inst 0x6fa0e091 // udot v17.4s, v4.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e095 // udot v21.4s, v4.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e099 // udot v25.4s, v4.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e09d // udot v29.4s, v4.16b, v3.4b[1]\n"
+ "ldr q4, [x11, #0xc0]\n"
+ ".inst 0x6fa0e0b2 // udot v18.4s, v5.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0b6 // udot v22.4s, v5.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0ba // udot v26.4s, v5.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0be // udot v30.4s, v5.16b, v3.4b[1]\n"
+ "ldr q5, [x11, #0xd0]\n"
+ ".inst 0x6fa0e0d3 // udot v19.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0d7 // udot v23.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0db // udot v27.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0df // udot v31.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x11, #0xe0]\n"
+ ".inst 0x6f80e8f0 // udot v16.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8f4 // udot v20.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f8 // udot v24.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8fc // udot v28.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x11, #0xf0]\n"
+ ".inst 0x6f80e911 // udot v17.4s, v8.16b, v0.4b[2]\n"
+ "add x11, x11, #0x100\n"
+ ".inst 0x6f81e915 // udot v21.4s, v8.16b, v1.4b[2]\n"
+ ".inst 0x6f82e919 // udot v25.4s, v8.16b, v2.4b[2]\n"
+ ".inst 0x6f83e91d // udot v29.4s, v8.16b, v3.4b[2]\n"
+ ".inst 0x6f80e932 // udot v18.4s, v9.16b, v0.4b[2]\n"
+ ".inst 0x6f81e936 // udot v22.4s, v9.16b, v1.4b[2]\n"
+ ".inst 0x6f82e93a // udot v26.4s, v9.16b, v2.4b[2]\n"
+ ".inst 0x6f83e93e // udot v30.4s, v9.16b, v3.4b[2]\n"
+ ".inst 0x6f80e953 // udot v19.4s, v10.16b, v0.4b[2]\n"
+ ".inst 0x6f81e957 // udot v23.4s, v10.16b, v1.4b[2]\n"
+ ".inst 0x6f82e95b // udot v27.4s, v10.16b, v2.4b[2]\n"
+ ".inst 0x6f83e95f // udot v31.4s, v10.16b, v3.4b[2]\n"
+ ".inst 0x6fa0e890 // udot v16.4s, v4.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e894 // udot v20.4s, v4.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e898 // udot v24.4s, v4.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e89c // udot v28.4s, v4.16b, v3.4b[3]\n"
+ ".inst 0x6fa0e8b1 // udot v17.4s, v5.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8b5 // udot v21.4s, v5.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8b9 // udot v25.4s, v5.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8bd // udot v29.4s, v5.16b, v3.4b[3]\n"
+ ".inst 0x6fa0e8d2 // udot v18.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8d6 // udot v22.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8da // udot v26.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8de // udot v30.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa0e8f3 // udot v19.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8f7 // udot v23.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8fb // udot v27.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8ff // udot v31.4s, v7.16b, v3.4b[3]\n"
+ "tbnz %x[flags], #31, 104f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
+ "104:" // Height 4: Multiply loop: unique 14: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "105:" // Height 4: Multiply loop: Main loop skip
+ "cbz x27, 112f\n"
+ "cmp x27, #0x4\n"
+ "blt 108f\n"
+ "106:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x26], #0x4\n"
+ "ldr s1, [x24], #0x4\n"
+ "ldr s2, [x22], #0x4\n"
+ "ldr s3, [x20], #0x4\n"
+ "tbnz %x[flags], #31, 107f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
+ "107:" // Height 4: Multiply loop: unique 15: skip row sum
+ "ldr q8, [x11, #0x0]\n"
+ ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
+ "ldr q9, [x11, #0x10]\n"
+ ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
+ "ldr q10, [x11, #0x20]\n"
+ ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
+ "ldr q4, [x11, #0x30]\n"
+ ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
+ "sub x27, x27, #0x4\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
+ "cmp x27, #0x4\n"
+ ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
+ ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
+ ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
+ ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
+ ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
+ ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
+ ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
+ ".inst 0x6f80e093 // udot v19.4s, v4.16b, v0.4b[0]\n"
+ ".inst 0x6f81e097 // udot v23.4s, v4.16b, v1.4b[0]\n"
+ ".inst 0x6f82e09b // udot v27.4s, v4.16b, v2.4b[0]\n"
+ ".inst 0x6f83e09f // udot v31.4s, v4.16b, v3.4b[0]\n"
+ "bge 106b\n"
+ "cbz x27, 112f\n"
+ "108:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x27, #1, 109f\n"
+ "ldr h0, [x26], #0x2\n"
+ "ldr h1, [x24], #0x2\n"
+ "ldr h2, [x22], #0x2\n"
+ "ldr h3, [x20], #0x2\n"
+ "tbz x27, #0, 110f\n"
+ "ld1 { v0.b }[2], [x26]\n"
+ "ld1 { v1.b }[2], [x24]\n"
+ "ld1 { v2.b }[2], [x22]\n"
+ "ld1 { v3.b }[2], [x20]\n"
+ "b 110f\n"
+ "109:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x26, #0x0]\n"
+ "ldr b1, [x24, #0x0]\n"
+ "ldr b2, [x22, #0x0]\n"
+ "ldr b3, [x20, #0x0]\n"
+ "110:" // Height 4: Multiply loop: Ragged operand read: Done
+ "tbnz %x[flags], #31, 111f\n"
+ ".inst 0x6e8f940b // udot v11.4s, v0.16b, v15.16b\n"
+ ".inst 0x6e8f942c // udot v12.4s, v1.16b, v15.16b\n"
+ ".inst 0x6e8f944d // udot v13.4s, v2.16b, v15.16b\n"
+ ".inst 0x6e8f946e // udot v14.4s, v3.16b, v15.16b\n"
+ "111:" // Height 4: Multiply loop: unique 16: skip row sum
+ "ldr q5, [x11, #0x0]\n"
+ ".inst 0x6f80e0b0 // udot v16.4s, v5.16b, v0.4b[0]\n"
+ "ldr q6, [x11, #0x10]\n"
+ ".inst 0x6f81e0b4 // udot v20.4s, v5.16b, v1.4b[0]\n"
+ "ldr q7, [x11, #0x20]\n"
+ ".inst 0x6f82e0b8 // udot v24.4s, v5.16b, v2.4b[0]\n"
+ "ldr q8, [x11, #0x30]\n"
+ ".inst 0x6f83e0bc // udot v28.4s, v5.16b, v3.4b[0]\n"
+ "add x11, x11, #0x40\n"
+ ".inst 0x6f80e0d1 // udot v17.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0d5 // udot v21.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d9 // udot v25.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0dd // udot v29.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0f2 // udot v18.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0f6 // udot v22.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0fa // udot v26.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0fe // udot v30.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f80e113 // udot v19.4s, v8.16b, v0.4b[0]\n"
+ ".inst 0x6f81e117 // udot v23.4s, v8.16b, v1.4b[0]\n"
+ ".inst 0x6f82e11b // udot v27.4s, v8.16b, v2.4b[0]\n"
+ ".inst 0x6f83e11f // udot v31.4s, v8.16b, v3.4b[0]\n"
+ "112:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x28, x28, #0x1\n"
+ "cmp x28, x19\n"
+ "bne 98b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbnz %x[flags], #31, 113f\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "addp v14.4s, v14.4s, v14.4s\n"
+ "addp v11.4s, v11.4s, v11.4s\n"
+ "addp v12.4s, v12.4s, v12.4s\n"
+ "addp v13.4s, v13.4s, v13.4s\n"
+ "addp v14.4s, v14.4s, v14.4s\n"
+ "neg v4.4s, v4.4s\n"
+ "mul v11.4s, v11.4s, v4.4s\n"
+ "mul v12.4s, v12.4s, v4.4s\n"
+ "mul v13.4s, v13.4s, v4.4s\n"
+ "mul v14.4s, v14.4s, v4.4s\n"
+ "113:" // Height 4: skip row sum fixup
+ "add v16.4s, v16.4s, v11.4s\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add v17.4s, v17.4s, v11.4s\n"
+ "ldr q0, [x10, #0x0]\n"
+ "add v18.4s, v18.4s, v11.4s\n"
+ "ldr q1, [x10, #0x10]\n"
+ "add v19.4s, v19.4s, v11.4s\n"
+ "ldr q2, [x10, #0x20]\n"
+ "add v20.4s, v20.4s, v12.4s\n"
+ "ldr q3, [x10, #0x30]\n"
+ "add v21.4s, v21.4s, v12.4s\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add v22.4s, v22.4s, v12.4s\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "add v23.4s, v23.4s, v12.4s\n"
+ "add x10, x10, #0x40\n"
+ "add v24.4s, v24.4s, v13.4s\n"
+ "add v25.4s, v25.4s, v13.4s\n"
+ "add v26.4s, v26.4s, v13.4s\n"
+ "add v27.4s, v27.4s, v13.4s\n"
+ "add v28.4s, v28.4s, v14.4s\n"
+ "add v29.4s, v29.4s, v14.4s\n"
+ "add v30.4s, v30.4s, v14.4s\n"
+ "add v31.4s, v31.4s, v14.4s\n"
+ "add v16.4s, v16.4s, v0.4s\n"
+ "add v17.4s, v17.4s, v1.4s\n"
+ "add v18.4s, v18.4s, v2.4s\n"
+ "add v19.4s, v19.4s, v3.4s\n"
+ "add v20.4s, v20.4s, v0.4s\n"
+ "add v21.4s, v21.4s, v1.4s\n"
+ "add v22.4s, v22.4s, v2.4s\n"
+ "add v23.4s, v23.4s, v3.4s\n"
+ "add v24.4s, v24.4s, v0.4s\n"
+ "add v25.4s, v25.4s, v1.4s\n"
+ "add v26.4s, v26.4s, v2.4s\n"
+ "add v27.4s, v27.4s, v3.4s\n"
+ "add v28.4s, v28.4s, v0.4s\n"
+ "ld1r { v0.4s }, [x20]\n"
+ "add v29.4s, v29.4s, v1.4s\n"
+ "add v30.4s, v30.4s, v2.4s\n"
+ "add v31.4s, v31.4s, v3.4s\n"
+ "sqrdmulh v16.4s, v16.4s, v4.4s\n"
+ "sqrdmulh v17.4s, v17.4s, v4.4s\n"
+ "sqrdmulh v18.4s, v18.4s, v4.4s\n"
+ "sqrdmulh v19.4s, v19.4s, v4.4s\n"
+ "sqrdmulh v20.4s, v20.4s, v4.4s\n"
+ "sqrdmulh v21.4s, v21.4s, v4.4s\n"
+ "sqrdmulh v22.4s, v22.4s, v4.4s\n"
+ "sqrdmulh v23.4s, v23.4s, v4.4s\n"
+ "sqrdmulh v24.4s, v24.4s, v4.4s\n"
+ "sqrdmulh v25.4s, v25.4s, v4.4s\n"
+ "sqrdmulh v26.4s, v26.4s, v4.4s\n"
+ "sqrdmulh v27.4s, v27.4s, v4.4s\n"
+ "sqrdmulh v28.4s, v28.4s, v4.4s\n"
+ "sqrdmulh v29.4s, v29.4s, v4.4s\n"
+ "sqrdmulh v30.4s, v30.4s, v4.4s\n"
+ "sqrdmulh v31.4s, v31.4s, v4.4s\n"
+ "tbz %x[flags], #5, 114f\n"
+ "and v4.16b, v16.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v17.16b, v0.16b\n"
+ "and v6.16b, v18.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "and v7.16b, v19.16b, v0.16b\n"
+ "and v8.16b, v20.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "and v9.16b, v21.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "sqadd v16.4s, v16.4s, v4.4s\n"
+ "and v10.16b, v22.16b, v0.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "and v4.16b, v23.16b, v0.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "sqadd v17.4s, v17.4s, v5.4s\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v18.4s, v18.4s, v6.4s\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "and v5.16b, v24.16b, v0.16b\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v19.4s, v19.4s, v7.4s\n"
+ "sqadd v20.4s, v20.4s, v8.4s\n"
+ "sqadd v21.4s, v21.4s, v9.4s\n"
+ "sqadd v22.4s, v22.4s, v10.4s\n"
+ "sqadd v23.4s, v23.4s, v4.4s\n"
+ "and v6.16b, v25.16b, v0.16b\n"
+ "sshr v6.4s, v6.4s, #0x1f\n"
+ "sqadd v24.4s, v24.4s, v5.4s\n"
+ "and v7.16b, v26.16b, v0.16b\n"
+ "sshr v7.4s, v7.4s, #0x1f\n"
+ "and v8.16b, v27.16b, v0.16b\n"
+ "and v9.16b, v28.16b, v0.16b\n"
+ "sshr v8.4s, v8.4s, #0x1f\n"
+ "sqadd v25.4s, v25.4s, v6.4s\n"
+ "and v10.16b, v29.16b, v0.16b\n"
+ "sshr v9.4s, v9.4s, #0x1f\n"
+ "and v4.16b, v30.16b, v0.16b\n"
+ "sshr v10.4s, v10.4s, #0x1f\n"
+ "sqadd v26.4s, v26.4s, v7.4s\n"
+ "and v5.16b, v31.16b, v0.16b\n"
+ "sshr v4.4s, v4.4s, #0x1f\n"
+ "sqadd v27.4s, v27.4s, v8.4s\n"
+ "sshr v5.4s, v5.4s, #0x1f\n"
+ "sqadd v28.4s, v28.4s, v9.4s\n"
+ "sqadd v29.4s, v29.4s, v10.4s\n"
+ "sqadd v30.4s, v30.4s, v4.4s\n"
+ "sqadd v31.4s, v31.4s, v5.4s\n"
+ "114:" // Height 4: no shift correction
+ "srshl v16.4s, v16.4s, v0.4s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1r { v4.4s }, [x19]\n"
+ "srshl v17.4s, v17.4s, v0.4s\n"
+ "add x19, %x[qp], %[minval]\n"
+ "srshl v18.4s, v18.4s, v0.4s\n"
+ "ld1r { v5.4s }, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ "srshl v19.4s, v19.4s, v0.4s\n"
+ "ld1r { v6.4s }, [x19]\n"
+ "cmp x12, #0x10\n"
+ "srshl v20.4s, v20.4s, v0.4s\n"
+ "srshl v21.4s, v21.4s, v0.4s\n"
+ "srshl v22.4s, v22.4s, v0.4s\n"
+ "srshl v23.4s, v23.4s, v0.4s\n"
+ "add v16.4s, v16.4s, v4.4s\n"
+ "add v17.4s, v17.4s, v4.4s\n"
+ "add v18.4s, v18.4s, v4.4s\n"
+ "smin v16.4s, v16.4s, v6.4s\n"
+ "smin v17.4s, v17.4s, v6.4s\n"
+ "smin v18.4s, v18.4s, v6.4s\n"
+ "smax v16.4s, v16.4s, v5.4s\n"
+ "smax v17.4s, v17.4s, v5.4s\n"
+ "smax v18.4s, v18.4s, v5.4s\n"
+ "add v19.4s, v19.4s, v4.4s\n"
+ "add v20.4s, v20.4s, v4.4s\n"
+ "add v21.4s, v21.4s, v4.4s\n"
+ "smin v19.4s, v19.4s, v6.4s\n"
+ "smin v20.4s, v20.4s, v6.4s\n"
+ "smin v21.4s, v21.4s, v6.4s\n"
+ "smax v19.4s, v19.4s, v5.4s\n"
+ "smax v20.4s, v20.4s, v5.4s\n"
+ "smax v21.4s, v21.4s, v5.4s\n"
+ "add v22.4s, v22.4s, v4.4s\n"
+ "add v23.4s, v23.4s, v4.4s\n"
+ "srshl v24.4s, v24.4s, v0.4s\n"
+ "smin v22.4s, v22.4s, v6.4s\n"
+ "smin v23.4s, v23.4s, v6.4s\n"
+ "srshl v25.4s, v25.4s, v0.4s\n"
+ "smax v22.4s, v22.4s, v5.4s\n"
+ "smax v23.4s, v23.4s, v5.4s\n"
+ "add v24.4s, v24.4s, v4.4s\n"
+ "add v25.4s, v25.4s, v4.4s\n"
+ "srshl v26.4s, v26.4s, v0.4s\n"
+ "smin v24.4s, v24.4s, v6.4s\n"
+ "smin v25.4s, v25.4s, v6.4s\n"
+ "srshl v27.4s, v27.4s, v0.4s\n"
+ "smax v24.4s, v24.4s, v5.4s\n"
+ "smax v25.4s, v25.4s, v5.4s\n"
+ "add v26.4s, v26.4s, v4.4s\n"
+ "add v27.4s, v27.4s, v4.4s\n"
+ "srshl v28.4s, v28.4s, v0.4s\n"
+ "smin v26.4s, v26.4s, v6.4s\n"
+ "smin v27.4s, v27.4s, v6.4s\n"
+ "srshl v29.4s, v29.4s, v0.4s\n"
+ "smax v26.4s, v26.4s, v5.4s\n"
+ "smax v27.4s, v27.4s, v5.4s\n"
+ "add v28.4s, v28.4s, v4.4s\n"
+ "add v29.4s, v29.4s, v4.4s\n"
+ "srshl v30.4s, v30.4s, v0.4s\n"
+ "smin v28.4s, v28.4s, v6.4s\n"
+ "smin v29.4s, v29.4s, v6.4s\n"
+ "srshl v31.4s, v31.4s, v0.4s\n"
+ "smax v28.4s, v28.4s, v5.4s\n"
+ "smax v29.4s, v29.4s, v5.4s\n"
+ "add v30.4s, v30.4s, v4.4s\n"
+ "add v31.4s, v31.4s, v4.4s\n"
+ "uzp1 v16.8h, v16.8h, v17.8h\n"
+ "smin v30.4s, v30.4s, v6.4s\n"
+ "smin v31.4s, v31.4s, v6.4s\n"
+ "uzp1 v17.8h, v18.8h, v19.8h\n"
+ "smax v30.4s, v30.4s, v5.4s\n"
+ "smax v31.4s, v31.4s, v5.4s\n"
+ "uzp1 v20.8h, v20.8h, v21.8h\n"
+ "uzp1 v21.8h, v22.8h, v23.8h\n"
+ "uzp1 v24.8h, v24.8h, v25.8h\n"
+ "uzp1 v25.8h, v26.8h, v27.8h\n"
+ "uzp1 v28.8h, v28.8h, v29.8h\n"
+ "uzp1 v29.8h, v30.8h, v31.8h\n"
+ "uzp1 v16.16b, v16.16b, v17.16b\n"
+ "uzp1 v20.16b, v20.16b, v21.16b\n"
+ "uzp1 v24.16b, v24.16b, v25.16b\n"
+ "uzp1 v28.16b, v28.16b, v29.16b\n"
+ "bge 123f\n"
+ "tbz x12, #3, 118f\n"
+ "str d16, [x9], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x12, #2, 116f\n"
+ "st1 { v16.s }[2], [x9], #0x4\n"
+ "st1 { v20.s }[2], [x25], #0x4\n"
+ "st1 { v24.s }[2], [x23], #0x4\n"
+ "st1 { v28.s }[2], [x21], #0x4\n"
+ "tbz x12, #1, 115f\n"
+ "st1 { v16.h }[6], [x9], #0x2\n"
+ "st1 { v20.h }[6], [x25], #0x2\n"
+ "st1 { v24.h }[6], [x23], #0x2\n"
+ "st1 { v28.h }[6], [x21], #0x2\n"
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[14], [x9]\n"
+ "st1 { v20.b }[14], [x25]\n"
+ "st1 { v24.b }[14], [x23]\n"
+ "st1 { v28.b }[14], [x21]\n"
+ "b 122f\n"
+ "115:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[12], [x9]\n"
+ "st1 { v20.b }[12], [x25]\n"
+ "st1 { v24.b }[12], [x23]\n"
+ "st1 { v28.b }[12], [x21]\n"
+ "b 122f\n"
+ "116:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x12, #1, 117f\n"
+ "st1 { v16.h }[4], [x9], #0x2\n"
+ "st1 { v20.h }[4], [x25], #0x2\n"
+ "st1 { v24.h }[4], [x23], #0x2\n"
+ "st1 { v28.h }[4], [x21], #0x2\n"
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[10], [x9]\n"
+ "st1 { v20.b }[10], [x25]\n"
+ "st1 { v24.b }[10], [x23]\n"
+ "st1 { v28.b }[10], [x21]\n"
+ "b 122f\n"
+ "117:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[8], [x9]\n"
+ "st1 { v20.b }[8], [x25]\n"
+ "st1 { v24.b }[8], [x23]\n"
+ "st1 { v28.b }[8], [x21]\n"
+ "b 122f\n"
+ "118:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x12, #2, 120f\n"
+ "str s16, [x9], #0x4\n"
+ "str s20, [x25], #0x4\n"
+ "str s24, [x23], #0x4\n"
+ "str s28, [x21], #0x4\n"
+ "tbz x12, #1, 119f\n"
+ "st1 { v16.h }[2], [x9], #0x2\n"
+ "st1 { v20.h }[2], [x25], #0x2\n"
+ "st1 { v24.h }[2], [x23], #0x2\n"
+ "st1 { v28.h }[2], [x21], #0x2\n"
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[6], [x9]\n"
+ "st1 { v20.b }[6], [x25]\n"
+ "st1 { v24.b }[6], [x23]\n"
+ "st1 { v28.b }[6], [x21]\n"
+ "b 122f\n"
+ "119:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[4], [x9]\n"
+ "st1 { v20.b }[4], [x25]\n"
+ "st1 { v24.b }[4], [x23]\n"
+ "st1 { v28.b }[4], [x21]\n"
+ "b 122f\n"
+ "120:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x12, #1, 121f\n"
+ "str h16, [x9], #0x2\n"
+ "str h20, [x25], #0x2\n"
+ "str h24, [x23], #0x2\n"
+ "str h28, [x21], #0x2\n"
+ "tbz x12, #0, 122f\n"
+ "st1 { v16.b }[2], [x9]\n"
+ "st1 { v20.b }[2], [x25]\n"
+ "st1 { v24.b }[2], [x23]\n"
+ "st1 { v28.b }[2], [x21]\n"
+ "b 122f\n"
+ "121:" // Height 4: Partial direct writeback: partial_1_0
+ "str b16, [x9, #0x0]\n"
+ "str b20, [x25, #0x0]\n"
+ "str b24, [x23, #0x0]\n"
+ "str b28, [x21, #0x0]\n"
+ "122:" // Height 4: Partial direct writeback: Done
+ "b 124f\n"
+ "123:" // Height 4: Full writeback
+ "str q16, [x9, #0x0]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q28, [x21, #0x0]\n"
+ "add x9, x9, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "add x21, x21, #0x10\n"
+ "124:" // Height 4: Writeback done
+ "subs x12, x12, #0x10\n"
+ "bgt 96b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 126f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 125f\n"
+ "add x20, x20, #0x4\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "125:" // Update direct input
+ "mov x19, #0x4\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "126:" // Exit
+
+ : [M] "+r" (M), [flags] "+r" (flags), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
deleted file mode 100644
index 735e5fd45a..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/a55.cpp
+++ /dev/null
@@ -1,2434 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include <cstdint>
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_u8u32_dot_16x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool accumulate) {
- const int K_stride = ((K + 3) / 4) * 4;
- const long loops_count = ((K + 16) / 32) - 1;
- K -= loops_count * 32;
- const long regs_count = (K / 16) - 1;
- K -= (regs_count + 1) * 16;
- const long blocks_count = K / 4;
- const long odds_count = K - (blocks_count * 4);
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const uint8_t * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(uint8_t);
-
- uint32_t *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=16ul) {
- const long width = std::min((unsigned long)N-x0, 16ul);
- long loops = loops_count;
- long regs = regs_count;
- long blocks = blocks_count;
- long odds = odds_count;
- const uint8_t *a_ptr0 = a_ptr0_base;
- const uint8_t *b_ptr0 = B + (K_stride * x0);
- const bool use_result_buffer = (width < 16);
- uint32_t result_buffer[64];
- const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(uint32_t);
- uint32_t *c_ptr_real = c_ptr0;
- if (use_result_buffer && accumulate) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
- }
- }
- }
- if (use_result_buffer) {
- c_ptr0 = result_buffer;
- }
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "temploadreg0 .req X0\n"
- "temploadreg1 .req X1\n"
- "temploadreg2 .req X2\n"
- "temploadreg3 .req X3\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v18.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v19.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ins v4.d[1], temploadreg0\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ins v0.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "b.ne 3b\n"
- "2:\n"
- "ins v14.d[1], temploadreg2\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- "ins v13.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "ins v12.d[1], temploadreg0\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- case 2: // Two-row variant: a second A/C row is derived below from a_ptr0+lda and c_ptr0+ldc (switch selector is outside this view - presumably the row count; confirm).
- __asm __volatile (
- "a_ptr1 .req X0\n" // Symbolic names for scratch GPRs x0-x5; all are released by the .unreq lines at the end.
- "c_ptr1 .req X1\n"
- "temploadreg0 .req X2\n"
- "temploadreg1 .req X3\n"
- "temploadreg2 .req X4\n"
- "temploadreg3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n" // Row-1 input pointer = row-0 input pointer + lda.
- "add c_ptr1, %[c_ptr0], %[ldc]\n" // Row-1 output pointer = row-0 output pointer + ldc.
- "cbnz %[accumulate], 1f\n" // accumulate != 0: seed the accumulators from C at label 1.
- "movi v16.4s, #0\n" // Otherwise zero accumulators v16-v23, interleaved with the first A/B loads.
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v19.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v20.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v21.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v22.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v23.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n" // v14 arrives as two 64-bit halves: low via d14, high via temploadreg2 (inserted at label 3 or 2).
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n" // No main-loop iterations requested: go straight to the tail.
- "b 3f\n"
- "1:\n" // Accumulate path: v16-v19 start from C row 0, v20-v23 from C row 1; then the same initial A/B loads as above.
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n" // Main loop: each iteration advances both A row pointers by 0x20 bytes and b_ptr0 by 0x200 bytes (two adds of 0x100).
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ins v14.d[1], temploadreg2\n" // Complete v14 from the 64-bit half loaded before entering/repeating the loop.
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n" // Second A vector for row 0 (v4), again fetched as d-half + GPR half + ins.
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr d5, [a_ptr1]\n" // Second A vector for row 1 (v5).
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "subs %[loops], %[loops], #0x1\n" // Decrement the iteration counter; the resulting flags are consumed by "b.ne 3b" at the end of the loop body.
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- "ins v15.d[1], temploadreg3\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n" // First of two B advances per iteration; following B loads use negative offsets.
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" // From here the iteration multiplies by the second A vectors v4 (row 0) and v5 (row 1).
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n" // Reload v0 (and v1 below) with the A data used at the top of the next iteration / in the tail.
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v0.d[1], temploadreg0\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v1.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n" // Second B advance of the iteration.
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n" // High half of v14 is left in temploadreg2; it is inserted at the top of label 3 or at label 2.
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "b.ne 3b\n" // Repeat while %[loops] has not reached zero (flags from the subs earlier in the body).
- "2:\n" // Tail: finish assembling v14/v15, prefetch both C rows for store, then pick the tail variant by %[regs].
- "ins v14.d[1], temploadreg2\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n" // regs == 0: only v0/v1 are left to process (label 4); otherwise also load and process v4/v5 below.
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n" // Second tail half: multiply by v4/v5 loaded at the start of this tail variant.
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- "b 5f\n"
- "4:\n" // Short tail (regs == 0): consume only the already-loaded v0/v1, fetching the remaining 0x80 bytes of B on the way.
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "5:\n" // Both tail variants join here; next handle leftover 4-byte groups counted by %[blocks].
- "cbz %[blocks], 6f\n"
- "7:\n" // Block loop: each pass reads 4 bytes from each A row (ldr s0 / s1) and 0x40 bytes of B.
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n" // Flags consumed by "b.ne 7b" below.
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr s1, [a_ptr1]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "b.ne 7b\n"
- "6:\n" // Leftover single bytes: %[odds] (1..3 handled here) bytes per row are loaded lane by lane into v0/v1.
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n" // NOTE(review): the other byte lanes of v0/v1 keep their previous contents - presumably B is zero-padded for the missing bytes; confirm against the B packing code.
- "ld1 {v1.b}[0], [a_ptr1], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[1], [a_ptr1], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n" // Third byte: loaded without post-increment (nothing reads the A pointers afterwards in this asm).
- "ld1 {v1.b}[2], [a_ptr1]\n"
- "9:\n" // One final group of dot products over lane 0 of v0/v1 with 0x40 bytes of B.
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "8:\n" // Write-back: v16-v19 go to C row 0, v20-v23 to C row 1 (0x40 bytes each).
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n" // c_ptr0 is a read-write operand, so this advance is visible to the surrounding C++ code.
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- ".unreq a_ptr1\n" // Drop the register aliases so later asm blocks can redefine them.
- ".unreq c_ptr1\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds) // Read-write operands: pointers and counters are modified by the asm.
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb) // Read-only inputs. NOTE(review): %[width] is not referenced by the asm text of this case.
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" // Clobbers: x0-x5 back the .req aliases; the vector list is a superset of the registers this case touches (v0, v1, v4, v5, v8-v23).
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "temploadreg0 .req X4\n"
- "temploadreg1 .req X5\n"
- "temploadreg2 .req X6\n"
- "temploadreg3 .req X7\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v20.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v21.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v22.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v23.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v24.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v25.4s, #0\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "movi v26.4s, #0\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "movi v27.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ins v14.d[1], temploadreg2\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v14.d[1], temploadreg2\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v5.d[1], temploadreg1\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ins v0.d[1], temploadreg0\n"
- ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ins v1.d[1], temploadreg1\n"
- ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "ins v15.d[1], temploadreg3\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- "ldr d2, [a_ptr2, #-0x10]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr temploadreg2, [a_ptr2, #-0x8]\n"
- ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- "ins v8.d[1], temploadreg0\n"
- "ins v2.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- "ins v9.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- "ins v10.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- "ins v11.d[1], temploadreg3\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "ins v14.d[1], temploadreg2\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr s1, [a_ptr1]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr s2, [a_ptr2]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[0], [a_ptr1], #1\n"
- "ld1 {v2.b}[0], [a_ptr2], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[1], [a_ptr1], #1\n"
- "ld1 {v2.b}[1], [a_ptr2], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "ld1 {v1.b}[2], [a_ptr1]\n"
- "ld1 {v2.b}[2], [a_ptr2]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "temploadreg0 .req X6\n"
- "temploadreg1 .req X7\n"
- "temploadreg2 .req X8\n"
- "temploadreg3 .req X9\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q3, [a_ptr3]\n"
- "movi v20.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v21.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v22.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v23.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v24.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v25.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v26.4s, #0\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "movi v27.4s, #0\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "movi v28.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "movi v29.4s, #0\n"
- "ins v14.d[1], temploadreg2\n"
- "movi v30.4s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "movi v31.4s, #0\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q28, [c_ptr3]\n"
- "ldr q29, [c_ptr3, #0x10]\n"
- "ldr q30, [c_ptr3, #0x20]\n"
- "ldr q31, [c_ptr3, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q3, [a_ptr3]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v14.d[1], temploadreg2\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d7, [a_ptr3]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x8]\n"
- ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ins v6.d[1], temploadreg2\n"
- ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v7.d[1], temploadreg3\n"
- ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr d0, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #-0x8]\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- "ins v0.d[1], temploadreg0\n"
- ".inst 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr d1, [a_ptr1, #-0x10]\n"
- ".inst 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #-0x8]\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- "ins v1.d[1], temploadreg1\n"
- ".inst 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr d2, [a_ptr2, #-0x10]\n"
- ".inst 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr temploadreg2, [a_ptr2, #-0x8]\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- "ins v2.d[1], temploadreg2\n"
- ".inst 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr d3, [a_ptr3, #-0x10]\n"
- ".inst 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
- "ldr temploadreg3, [a_ptr3, #-0x8]\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- "ins v3.d[1], temploadreg3\n"
- ".inst 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
- "ins v12.d[1], temploadreg0\n"
- "ins v13.d[1], temploadreg1\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- "ins v14.d[1], temploadreg2\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "ins v15.d[1], temploadreg3\n"
- "cbz %[regs], 4f\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr d4, [%[a_ptr0]]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg0, [%[a_ptr0], #0x8]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr d5, [a_ptr1]\n"
- ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr temploadreg1, [a_ptr1, #0x8]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr d6, [a_ptr2]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr temploadreg2, [a_ptr2, #0x8]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr d7, [a_ptr3]\n"
- ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr temploadreg3, [a_ptr3, #0x8]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ins v4.d[1], temploadreg0\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ins v5.d[1], temploadreg1\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "ins v6.d[1], temploadreg2\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ins v7.d[1], temploadreg3\n"
- ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr d8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x78]\n"
- ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
- "ldr d9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x68]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x58]\n"
- ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x48]\n"
- ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr d10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #-0x38]\n"
- ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr d11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #-0x28]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #-0x18]\n"
- ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr d12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #-0x8]\n"
- ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr d13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr d14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr d15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- ".inst 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- ".inst 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- ".inst 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- ".inst 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- ".inst 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- ".inst 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- ".inst 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr d8, [%[b_ptr0]]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ins v8.d[1], temploadreg0\n"
- ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr d9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ins v9.d[1], temploadreg1\n"
- ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr d10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ins v10.d[1], temploadreg2\n"
- ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr d11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ins v11.d[1], temploadreg3\n"
- ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr d12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ins v12.d[1], temploadreg0\n"
- ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr d13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ins v13.d[1], temploadreg1\n"
- ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr d14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ins v14.d[1], temploadreg2\n"
- ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr d15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ins v15.d[1], temploadreg3\n"
- ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr s1, [a_ptr1]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr s2, [a_ptr2]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr s3, [a_ptr3]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "add a_ptr3, a_ptr3, #0x4\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[0], [a_ptr1], #1\n"
- "ld1 {v2.b}[0], [a_ptr2], #1\n"
- "ld1 {v3.b}[0], [a_ptr3], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[1], [a_ptr1], #1\n"
- "ld1 {v2.b}[1], [a_ptr2], #1\n"
- "ld1 {v3.b}[1], [a_ptr3], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "ld1 {v1.b}[2], [a_ptr1]\n"
- "ld1 {v2.b}[2], [a_ptr2]\n"
- "ld1 {v3.b}[2], [a_ptr3]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- "str q28, [c_ptr3]\n"
- "str q29, [c_ptr3, #0x10]\n"
- "str q30, [c_ptr3, #0x20]\n"
- "str q31, [c_ptr3, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq temploadreg0\n"
- ".unreq temploadreg1\n"
- ".unreq temploadreg2\n"
- ".unreq temploadreg3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
- );
- break;
- }
- if (use_result_buffer) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
- }
- }
- }
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
deleted file mode 100644
index 2e86233a06..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_16x4/generic.cpp
+++ /dev/null
@@ -1,1808 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include <cstdint>
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void a64_hybrid_u8u32_dot_16x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool accumulate) {
- const int K_stride = ((K + 3) / 4) * 4;
- const long loops_count = ((K + 16) / 32) - 1;
- K -= loops_count * 32;
- const long regs_count = (K / 16) - 1;
- K -= (regs_count + 1) * 16;
- const long blocks_count = K / 4;
- const long odds_count = K - (blocks_count * 4);
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const uint8_t * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(uint8_t);
-
- uint32_t *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=16ul) {
- const long width = std::min((unsigned long)N-x0, 16ul);
- long loops = loops_count;
- long regs = regs_count;
- long blocks = blocks_count;
- long odds = odds_count;
- const uint8_t *a_ptr0 = a_ptr0_base;
- const uint8_t *b_ptr0 = B + (K_stride * x0);
- const bool use_result_buffer = (width < 16);
- uint32_t result_buffer[64];
- const unsigned long ldcb = (use_result_buffer ? 16 : ldc) * sizeof(uint32_t);
- uint32_t *c_ptr_real = c_ptr0;
- if (use_result_buffer && accumulate) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- result_buffer[cy * 16 + cx] = c_ptr_real[cy * ldc + cx];
- }
- }
- }
- if (use_result_buffer) {
- c_ptr0 = result_buffer;
- }
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v18.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v19.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "cbz %[regs], 4f\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v19.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v20.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v21.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v22.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v23.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "cbz %[regs], 4f\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr s1, [a_ptr1]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[0], [a_ptr1], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[1], [a_ptr1], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "ld1 {v1.b}[2], [a_ptr1]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v20.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v21.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v22.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v23.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v24.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v25.4s, #0\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "movi v26.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "cbz %[regs], 4f\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr s1, [a_ptr1]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr s2, [a_ptr2]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[0], [a_ptr1], #1\n"
- "ld1 {v2.b}[0], [a_ptr2], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[1], [a_ptr1], #1\n"
- "ld1 {v2.b}[1], [a_ptr2], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "ld1 {v1.b}[2], [a_ptr1]\n"
- "ld1 {v2.b}[2], [a_ptr2]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "cbnz %[accumulate], 1f\n"
- "movi v16.4s, #0\n"
- "ldr q0, [%[a_ptr0]]\n"
- "movi v17.4s, #0\n"
- "ldr q1, [a_ptr1]\n"
- "movi v18.4s, #0\n"
- "ldr q2, [a_ptr2]\n"
- "movi v19.4s, #0\n"
- "ldr q3, [a_ptr3]\n"
- "movi v20.4s, #0\n"
- "ldr q8, [%[b_ptr0]]\n"
- "movi v21.4s, #0\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "movi v22.4s, #0\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "movi v23.4s, #0\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "movi v24.4s, #0\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "movi v25.4s, #0\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "movi v26.4s, #0\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "movi v27.4s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "movi v28.4s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "movi v29.4s, #0\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "movi v30.4s, #0\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "movi v31.4s, #0\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ldr q16, [%[c_ptr0]]\n"
- "ldr q17, [%[c_ptr0], #0x10]\n"
- "ldr q18, [%[c_ptr0], #0x20]\n"
- "ldr q19, [%[c_ptr0], #0x30]\n"
- "ldr q20, [c_ptr1]\n"
- "ldr q21, [c_ptr1, #0x10]\n"
- "ldr q22, [c_ptr1, #0x20]\n"
- "ldr q23, [c_ptr1, #0x30]\n"
- "ldr q24, [c_ptr2]\n"
- "ldr q25, [c_ptr2, #0x10]\n"
- "ldr q26, [c_ptr2, #0x20]\n"
- "ldr q27, [c_ptr2, #0x30]\n"
- "ldr q28, [c_ptr3]\n"
- "ldr q29, [c_ptr3, #0x10]\n"
- "ldr q30, [c_ptr3, #0x20]\n"
- "ldr q31, [c_ptr3, #0x30]\n"
- "ldr q0, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ldr q1, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ldr q2, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ldr q3, [a_ptr3]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q7, [a_ptr3]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [%[a_ptr0], #0x40]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr1, #0x40]\n"
- ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr2, #0x40]\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr3, #0x40]\n"
- ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- "ldr q0, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- "ldr q1, [a_ptr1, #-0x10]\n"
- ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- "ldr q2, [a_ptr2, #-0x10]\n"
- ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- "ldr q3, [a_ptr3, #-0x10]\n"
- ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- ".inst 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- ".inst 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- ".inst 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- ".inst 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- ".inst 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- ".inst 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- ".inst 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- ".inst 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- ".inst 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- ".inst 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- ".inst 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- ".inst 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- ".inst 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- ".inst 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- "prfm PSTL1KEEP, [%[c_ptr0]]\n"
- "prfm PSTL1KEEP, [c_ptr1]\n"
- "prfm PSTL1KEEP, [c_ptr2]\n"
- "prfm PSTL1KEEP, [c_ptr3]\n"
- "cbz %[regs], 4f\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr q4, [%[a_ptr0]]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "ldr q5, [a_ptr1]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "ldr q6, [a_ptr2]\n"
- ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr q7, [a_ptr3]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x100\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
- "ldr q8, [%[b_ptr0], #-0x80]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
- "ldr q9, [%[b_ptr0], #-0x70]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
- "ldr q10, [%[b_ptr0], #-0x60]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
- "ldr q11, [%[b_ptr0], #-0x50]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
- "ldr q12, [%[b_ptr0], #-0x40]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
- "ldr q13, [%[b_ptr0], #-0x30]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
- "ldr q14, [%[b_ptr0], #-0x20]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
- "ldr q15, [%[b_ptr0], #-0x10]\n"
- ".inst 0x6f84e110 // udot v16.4s, v8.16b, v4.4b[0]\n"
- ".inst 0x6f85e114 // udot v20.4s, v8.16b, v5.4b[0]\n"
- ".inst 0x6f86e118 // udot v24.4s, v8.16b, v6.4b[0]\n"
- ".inst 0x6f87e11c // udot v28.4s, v8.16b, v7.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f84e131 // udot v17.4s, v9.16b, v4.4b[0]\n"
- ".inst 0x6f85e135 // udot v21.4s, v9.16b, v5.4b[0]\n"
- ".inst 0x6f86e139 // udot v25.4s, v9.16b, v6.4b[0]\n"
- ".inst 0x6f87e13d // udot v29.4s, v9.16b, v7.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f84e152 // udot v18.4s, v10.16b, v4.4b[0]\n"
- ".inst 0x6f85e156 // udot v22.4s, v10.16b, v5.4b[0]\n"
- ".inst 0x6f86e15a // udot v26.4s, v10.16b, v6.4b[0]\n"
- ".inst 0x6f87e15e // udot v30.4s, v10.16b, v7.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f84e173 // udot v19.4s, v11.16b, v4.4b[0]\n"
- ".inst 0x6f85e177 // udot v23.4s, v11.16b, v5.4b[0]\n"
- ".inst 0x6f86e17b // udot v27.4s, v11.16b, v6.4b[0]\n"
- ".inst 0x6f87e17f // udot v31.4s, v11.16b, v7.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa4e190 // udot v16.4s, v12.16b, v4.4b[1]\n"
- ".inst 0x6fa5e194 // udot v20.4s, v12.16b, v5.4b[1]\n"
- ".inst 0x6fa6e198 // udot v24.4s, v12.16b, v6.4b[1]\n"
- ".inst 0x6fa7e19c // udot v28.4s, v12.16b, v7.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa4e1b1 // udot v17.4s, v13.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1b5 // udot v21.4s, v13.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1b9 // udot v25.4s, v13.16b, v6.4b[1]\n"
- ".inst 0x6fa7e1bd // udot v29.4s, v13.16b, v7.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa4e1d2 // udot v18.4s, v14.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1d6 // udot v22.4s, v14.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1da // udot v26.4s, v14.16b, v6.4b[1]\n"
- ".inst 0x6fa7e1de // udot v30.4s, v14.16b, v7.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa4e1f3 // udot v19.4s, v15.16b, v4.4b[1]\n"
- ".inst 0x6fa5e1f7 // udot v23.4s, v15.16b, v5.4b[1]\n"
- ".inst 0x6fa6e1fb // udot v27.4s, v15.16b, v6.4b[1]\n"
- ".inst 0x6fa7e1ff // udot v31.4s, v15.16b, v7.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f84e910 // udot v16.4s, v8.16b, v4.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f85e914 // udot v20.4s, v8.16b, v5.4b[2]\n"
- ".inst 0x6f86e918 // udot v24.4s, v8.16b, v6.4b[2]\n"
- ".inst 0x6f87e91c // udot v28.4s, v8.16b, v7.4b[2]\n"
- ".inst 0x6f84e931 // udot v17.4s, v9.16b, v4.4b[2]\n"
- ".inst 0x6f85e935 // udot v21.4s, v9.16b, v5.4b[2]\n"
- ".inst 0x6f86e939 // udot v25.4s, v9.16b, v6.4b[2]\n"
- ".inst 0x6f87e93d // udot v29.4s, v9.16b, v7.4b[2]\n"
- ".inst 0x6f84e952 // udot v18.4s, v10.16b, v4.4b[2]\n"
- ".inst 0x6f85e956 // udot v22.4s, v10.16b, v5.4b[2]\n"
- ".inst 0x6f86e95a // udot v26.4s, v10.16b, v6.4b[2]\n"
- ".inst 0x6f87e95e // udot v30.4s, v10.16b, v7.4b[2]\n"
- ".inst 0x6f84e973 // udot v19.4s, v11.16b, v4.4b[2]\n"
- ".inst 0x6f85e977 // udot v23.4s, v11.16b, v5.4b[2]\n"
- ".inst 0x6f86e97b // udot v27.4s, v11.16b, v6.4b[2]\n"
- ".inst 0x6f87e97f // udot v31.4s, v11.16b, v7.4b[2]\n"
- ".inst 0x6fa4e990 // udot v16.4s, v12.16b, v4.4b[3]\n"
- ".inst 0x6fa5e994 // udot v20.4s, v12.16b, v5.4b[3]\n"
- ".inst 0x6fa6e998 // udot v24.4s, v12.16b, v6.4b[3]\n"
- ".inst 0x6fa7e99c // udot v28.4s, v12.16b, v7.4b[3]\n"
- ".inst 0x6fa4e9b1 // udot v17.4s, v13.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9b5 // udot v21.4s, v13.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9b9 // udot v25.4s, v13.16b, v6.4b[3]\n"
- ".inst 0x6fa7e9bd // udot v29.4s, v13.16b, v7.4b[3]\n"
- ".inst 0x6fa4e9d2 // udot v18.4s, v14.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9d6 // udot v22.4s, v14.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9da // udot v26.4s, v14.16b, v6.4b[3]\n"
- ".inst 0x6fa7e9de // udot v30.4s, v14.16b, v7.4b[3]\n"
- ".inst 0x6fa4e9f3 // udot v19.4s, v15.16b, v4.4b[3]\n"
- ".inst 0x6fa5e9f7 // udot v23.4s, v15.16b, v5.4b[3]\n"
- ".inst 0x6fa6e9fb // udot v27.4s, v15.16b, v6.4b[3]\n"
- ".inst 0x6fa7e9ff // udot v31.4s, v15.16b, v7.4b[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- "ldr q8, [%[b_ptr0]]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6fa0e190 // udot v16.4s, v12.16b, v0.4b[1]\n"
- ".inst 0x6fa1e194 // udot v20.4s, v12.16b, v1.4b[1]\n"
- ".inst 0x6fa2e198 // udot v24.4s, v12.16b, v2.4b[1]\n"
- ".inst 0x6fa3e19c // udot v28.4s, v12.16b, v3.4b[1]\n"
- "ldr q12, [%[b_ptr0], #0x40]\n"
- ".inst 0x6fa0e1b1 // udot v17.4s, v13.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1b5 // udot v21.4s, v13.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1b9 // udot v25.4s, v13.16b, v2.4b[1]\n"
- ".inst 0x6fa3e1bd // udot v29.4s, v13.16b, v3.4b[1]\n"
- "ldr q13, [%[b_ptr0], #0x50]\n"
- ".inst 0x6fa0e1d2 // udot v18.4s, v14.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1d6 // udot v22.4s, v14.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1da // udot v26.4s, v14.16b, v2.4b[1]\n"
- ".inst 0x6fa3e1de // udot v30.4s, v14.16b, v3.4b[1]\n"
- "ldr q14, [%[b_ptr0], #0x60]\n"
- ".inst 0x6fa0e1f3 // udot v19.4s, v15.16b, v0.4b[1]\n"
- ".inst 0x6fa1e1f7 // udot v23.4s, v15.16b, v1.4b[1]\n"
- ".inst 0x6fa2e1fb // udot v27.4s, v15.16b, v2.4b[1]\n"
- ".inst 0x6fa3e1ff // udot v31.4s, v15.16b, v3.4b[1]\n"
- "ldr q15, [%[b_ptr0], #0x70]\n"
- ".inst 0x6f80e910 // udot v16.4s, v8.16b, v0.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- ".inst 0x6f81e914 // udot v20.4s, v8.16b, v1.4b[2]\n"
- ".inst 0x6f82e918 // udot v24.4s, v8.16b, v2.4b[2]\n"
- ".inst 0x6f83e91c // udot v28.4s, v8.16b, v3.4b[2]\n"
- ".inst 0x6f80e931 // udot v17.4s, v9.16b, v0.4b[2]\n"
- ".inst 0x6f81e935 // udot v21.4s, v9.16b, v1.4b[2]\n"
- ".inst 0x6f82e939 // udot v25.4s, v9.16b, v2.4b[2]\n"
- ".inst 0x6f83e93d // udot v29.4s, v9.16b, v3.4b[2]\n"
- ".inst 0x6f80e952 // udot v18.4s, v10.16b, v0.4b[2]\n"
- ".inst 0x6f81e956 // udot v22.4s, v10.16b, v1.4b[2]\n"
- ".inst 0x6f82e95a // udot v26.4s, v10.16b, v2.4b[2]\n"
- ".inst 0x6f83e95e // udot v30.4s, v10.16b, v3.4b[2]\n"
- ".inst 0x6f80e973 // udot v19.4s, v11.16b, v0.4b[2]\n"
- ".inst 0x6f81e977 // udot v23.4s, v11.16b, v1.4b[2]\n"
- ".inst 0x6f82e97b // udot v27.4s, v11.16b, v2.4b[2]\n"
- ".inst 0x6f83e97f // udot v31.4s, v11.16b, v3.4b[2]\n"
- ".inst 0x6fa0e990 // udot v16.4s, v12.16b, v0.4b[3]\n"
- ".inst 0x6fa1e994 // udot v20.4s, v12.16b, v1.4b[3]\n"
- ".inst 0x6fa2e998 // udot v24.4s, v12.16b, v2.4b[3]\n"
- ".inst 0x6fa3e99c // udot v28.4s, v12.16b, v3.4b[3]\n"
- ".inst 0x6fa0e9b1 // udot v17.4s, v13.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9b5 // udot v21.4s, v13.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9b9 // udot v25.4s, v13.16b, v2.4b[3]\n"
- ".inst 0x6fa3e9bd // udot v29.4s, v13.16b, v3.4b[3]\n"
- ".inst 0x6fa0e9d2 // udot v18.4s, v14.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9d6 // udot v22.4s, v14.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9da // udot v26.4s, v14.16b, v2.4b[3]\n"
- ".inst 0x6fa3e9de // udot v30.4s, v14.16b, v3.4b[3]\n"
- ".inst 0x6fa0e9f3 // udot v19.4s, v15.16b, v0.4b[3]\n"
- ".inst 0x6fa1e9f7 // udot v23.4s, v15.16b, v1.4b[3]\n"
- ".inst 0x6fa2e9fb // udot v27.4s, v15.16b, v2.4b[3]\n"
- ".inst 0x6fa3e9ff // udot v31.4s, v15.16b, v3.4b[3]\n"
- "5:\n"
- "cbz %[blocks], 6f\n"
- "7:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr s0, [%[a_ptr0]]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "add %[a_ptr0], %[a_ptr0], #0x4\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- "ldr s1, [a_ptr1]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- "add a_ptr1, a_ptr1, #0x4\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- "ldr s2, [a_ptr2]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- "add a_ptr2, a_ptr2, #0x4\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- "ldr s3, [a_ptr3]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- "add a_ptr3, a_ptr3, #0x4\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "b.ne 7b\n"
- "6:\n"
- "cbz %[odds], 8f\n"
- "ld1 {v0.b}[0], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[0], [a_ptr1], #1\n"
- "ld1 {v2.b}[0], [a_ptr2], #1\n"
- "ld1 {v3.b}[0], [a_ptr3], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[1], [%[a_ptr0]], #1\n"
- "ld1 {v1.b}[1], [a_ptr1], #1\n"
- "ld1 {v2.b}[1], [a_ptr2], #1\n"
- "ld1 {v3.b}[1], [a_ptr3], #1\n"
- "subs %[odds], %[odds], #0x1\n"
- "b.eq 9f\n"
- "ld1 {v0.b}[2], [%[a_ptr0]]\n"
- "ld1 {v1.b}[2], [a_ptr1]\n"
- "ld1 {v2.b}[2], [a_ptr2]\n"
- "ld1 {v3.b}[2], [a_ptr3]\n"
- "9:\n"
- "ldr q8, [%[b_ptr0]]\n"
- "ldr q9, [%[b_ptr0], #0x10]\n"
- "ldr q10, [%[b_ptr0], #0x20]\n"
- "ldr q11, [%[b_ptr0], #0x30]\n"
- ".inst 0x6f80e110 // udot v16.4s, v8.16b, v0.4b[0]\n"
- ".inst 0x6f81e114 // udot v20.4s, v8.16b, v1.4b[0]\n"
- ".inst 0x6f82e118 // udot v24.4s, v8.16b, v2.4b[0]\n"
- ".inst 0x6f83e11c // udot v28.4s, v8.16b, v3.4b[0]\n"
- ".inst 0x6f80e131 // udot v17.4s, v9.16b, v0.4b[0]\n"
- ".inst 0x6f81e135 // udot v21.4s, v9.16b, v1.4b[0]\n"
- ".inst 0x6f82e139 // udot v25.4s, v9.16b, v2.4b[0]\n"
- ".inst 0x6f83e13d // udot v29.4s, v9.16b, v3.4b[0]\n"
- ".inst 0x6f80e152 // udot v18.4s, v10.16b, v0.4b[0]\n"
- ".inst 0x6f81e156 // udot v22.4s, v10.16b, v1.4b[0]\n"
- ".inst 0x6f82e15a // udot v26.4s, v10.16b, v2.4b[0]\n"
- ".inst 0x6f83e15e // udot v30.4s, v10.16b, v3.4b[0]\n"
- ".inst 0x6f80e173 // udot v19.4s, v11.16b, v0.4b[0]\n"
- ".inst 0x6f81e177 // udot v23.4s, v11.16b, v1.4b[0]\n"
- ".inst 0x6f82e17b // udot v27.4s, v11.16b, v2.4b[0]\n"
- ".inst 0x6f83e17f // udot v31.4s, v11.16b, v3.4b[0]\n"
- "8:\n"
- "str q16, [%[c_ptr0]]\n"
- "str q17, [%[c_ptr0], #0x10]\n"
- "str q18, [%[c_ptr0], #0x20]\n"
- "str q19, [%[c_ptr0], #0x30]\n"
- "add %[c_ptr0], %[c_ptr0], #0x40\n"
- "str q20, [c_ptr1]\n"
- "str q21, [c_ptr1, #0x10]\n"
- "str q22, [c_ptr1, #0x20]\n"
- "str q23, [c_ptr1, #0x30]\n"
- "str q24, [c_ptr2]\n"
- "str q25, [c_ptr2, #0x10]\n"
- "str q26, [c_ptr2, #0x20]\n"
- "str q27, [c_ptr2, #0x30]\n"
- "str q28, [c_ptr3]\n"
- "str q29, [c_ptr3, #0x10]\n"
- "str q30, [c_ptr3, #0x20]\n"
- "str q31, [c_ptr3, #0x30]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [blocks] "+r" (blocks), [odds] "+r" (odds)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- }
- if (use_result_buffer) {
- for(int cy=0; cy<std::min(M-y, 4); cy++) {
- for(unsigned int cx=0; cx<width; cx++) {
- c_ptr_real[cy * ldc + cx] = result_buffer[cy * 16 + cx];
- }
- }
- }
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
new file mode 100644
index 0000000000..238c1825f3
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __aarch64__
+
+#include "../std_transforms_fixed.hpp"
+
+#define ARGLIST /* kernel parameter type list; names noted below follow the definition in generic.cpp */ \
+ unsigned int, const unsigned int *, /* num_strings, string_lengths */ \
+ IndirectInputArg<uint8_t>, /* A_arg */ \
+ size_t, size_t, /* M, N */ \
+ const uint8_t *, /* B_ptr */ \
+ IndirectOutputArg<uint32_t>, /* output_arg */ \
+ const uint32_t *, Activation, bool /* (unnamed), (unnamed), accumulate */
+
+namespace arm_gemm
+{
+
+// Actual kernel implementation; the definition lives in a64_hybrid_u8u32_dot_6x16/generic.cpp
+void a64_hybrid_u8u32_dot_6x16( ARGLIST );
+
+class cls_a64_hybrid_u8u32_dot_6x16 // Describes the a64_hybrid_u8u32_dot_6x16 kernel: element types, blocking parameters and the function to call.
+{
+public:
+ typedef uint8_t operand_type; // element type of the kernel's input operands
+ typedef uint32_t result_type; // element type the kernel writes out
+
+ typedef void (*kern_type)( ARGLIST ); // pointer-to-kernel type; the parameter list is the ARGLIST macro
+
+ /* Kernel blocking parameters (the literals 6, 16 and 4 are repeated in `transforms` below) */
+ static constexpr unsigned int out_height()
+ {
+ return 6; // the "6" of the 6x16 in the kernel name
+ }
+
+ static unsigned int out_width() // NOTE(review): not constexpr, unlike the sibling accessors - confirm whether that is intentional
+ {
+ return 16; // the "16" of the 6x16 in the kernel name
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4; // presumably the four bytes each udot lane consumes - TODO confirm
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true; // presumably tied to the trailing bool in ARGLIST - TODO confirm
+ }
+
+ StdTransformsFixed<operand_type, result_type, 6, 16, 4> transforms = {}; // keep 6, 16, 4 in sync with out_height()/out_width()/k_unroll()
+
+ // Default to the generic kernel (the only implementation declared in this header)
+ kern_type kernel=a64_hybrid_u8u32_dot_6x16;
+
+ cls_a64_hybrid_u8u32_dot_6x16(const CPUInfo *) // the CPUInfo argument is unnamed and unused: `kernel` keeps its default
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
new file mode 100644
index 0000000000..3c8654147a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_hybrid_u8u32_dot_6x16/generic.cpp
@@ -0,0 +1,3335 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void a64_hybrid_u8u32_dot_6x16 (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+ size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint32_t> output_arg,
+ const uint32_t *, Activation, bool accumulate
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const uint8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ __asm__ __volatile__(
+
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 176f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 141f\n"
+ "beq 106f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 71f\n"
+ "beq 36f\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "tbz %x[flags], #0, 13f\n"
+ "cmp x15, #0x10\n"
+ "bge 12f\n"
+ "tbz x15, #3, 7f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "tbz x15, #2, 5f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "tbz x15, #1, 4f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "tbz x15, #0, 11f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "b 11f\n"
+ "4:" // Height 1: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x15, #0, 11f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "b 11f\n"
+ "5:" // Height 1: Partial accumulate: partial_2_8
+ "tbz x15, #1, 6f\n"
+ "ldr d10, [x13], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x15, #0, 11f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "b 11f\n"
+ "6:" // Height 1: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x15, #0, 11f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "b 11f\n"
+ "7:" // Height 1: Partial accumulate: partial_4_0
+ "tbz x15, #2, 9f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "tbz x15, #1, 8f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "tbz x15, #0, 11f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "b 11f\n"
+ "8:" // Height 1: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x15, #0, 11f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "b 11f\n"
+ "9:" // Height 1: Partial accumulate: partial_2_0
+ "tbz x15, #1, 10f\n"
+ "ldr d8, [x13], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x15, #0, 11f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "b 11f\n"
+ "10:" // Height 1: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "11:" // Height 1: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "b 14f\n"
+ "12:" // Height 1: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "b 14f\n"
+ "13:" // Height 1: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "14:" // Height 1: setup done
+ "mov x12, #0x0\n"
+ "15:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 16f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "cbnz x12, 17f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "b 17f\n"
+ "16:" // Height 1: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "17:" // Height 1: input setup done
+ "cmp x11, #0x10\n"
+ "blt 20f\n"
+ "cmp x11, #0x20\n"
+ "blt 19f\n"
+ "18:" // Height 1: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ "bge 18b\n"
+ "19:" // Height 1: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ "20:" // Height 1: Multiply loop: Main loop skip
+ "cbz x11, 25f\n"
+ "cmp x11, #0x4\n"
+ "blt 22f\n"
+ "21:" // Height 1: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ "cmp x11, #0x4\n"
+ "bge 21b\n"
+ "cbz x11, 25f\n"
+ "22:" // Height 1: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 23f\n"
+ "ldr h0, [x10], #0x2\n"
+ "tbz x11, #0, 24f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "b 24f\n"
+ "23:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "24:" // Height 1: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ "25:" // Height 1: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 15b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "cmp x15, #0x10\n"
+ "bge 34f\n"
+ "tbz x15, #3, 29f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "tbz x15, #2, 27f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "tbz x15, #1, 26f\n"
+ "str d11, [x13], #0x8\n"
+ "tbz x15, #0, 33f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "b 33f\n"
+ "26:" // Height 1: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 33f\n"
+ "str s11, [x13, #0x0]\n"
+ "b 33f\n"
+ "27:" // Height 1: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 28f\n"
+ "str d10, [x13], #0x8\n"
+ "tbz x15, #0, 33f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "b 33f\n"
+ "28:" // Height 1: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 33f\n"
+ "str s10, [x13, #0x0]\n"
+ "b 33f\n"
+ "29:" // Height 1: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 31f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "tbz x15, #1, 30f\n"
+ "str d9, [x13], #0x8\n"
+ "tbz x15, #0, 33f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "b 33f\n"
+ "30:" // Height 1: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 33f\n"
+ "str s9, [x13, #0x0]\n"
+ "b 33f\n"
+ "31:" // Height 1: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 32f\n"
+ "str d8, [x13], #0x8\n"
+ "tbz x15, #0, 33f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "b 33f\n"
+ "32:" // Height 1: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "33:" // Height 1: Partial direct writeback: Done
+ "b 35f\n"
+ "34:" // Height 1: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "35:" // Height 1: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 3b\n"
+ "b 212f\n"
+ "36:" // Height 2
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 37f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "b 38f\n"
+ "37:" // Height 2: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "38:" // Height 2: Column loop
+ "tbz %x[flags], #0, 48f\n"
+ "cmp x15, #0x10\n"
+ "bge 47f\n"
+ "tbz x15, #3, 42f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "tbz x15, #2, 40f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "tbz x15, #1, 39f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "tbz x15, #0, 46f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "b 46f\n"
+ "39:" // Height 2: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x15, #0, 46f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "b 46f\n"
+ "40:" // Height 2: Partial accumulate: partial_2_8
+ "tbz x15, #1, 41f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x15, #0, 46f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "b 46f\n"
+ "41:" // Height 2: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x15, #0, 46f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "b 46f\n"
+ "42:" // Height 2: Partial accumulate: partial_4_0
+ "tbz x15, #2, 44f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "tbz x15, #1, 43f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "tbz x15, #0, 46f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "b 46f\n"
+ "43:" // Height 2: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x15, #0, 46f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "b 46f\n"
+ "44:" // Height 2: Partial accumulate: partial_2_0
+ "tbz x15, #1, 45f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x15, #0, 46f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "b 46f\n"
+ "45:" // Height 2: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "46:" // Height 2: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "b 49f\n"
+ "47:" // Height 2: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "b 49f\n"
+ "48:" // Height 2: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "49:" // Height 2: setup done
+ "mov x12, #0x0\n"
+ "50:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 51f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x12, 52f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "b 52f\n"
+ "51:" // Height 2: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "52:" // Height 2: input setup done
+ "cmp x11, #0x10\n"
+ "blt 55f\n"
+ "cmp x11, #0x20\n"
+ "blt 54f\n"
+ "53:" // Height 2: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ "bge 53b\n"
+ "54:" // Height 2: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ "55:" // Height 2: Multiply loop: Main loop skip
+ "cbz x11, 60f\n"
+ "cmp x11, #0x4\n"
+ "blt 57f\n"
+ "56:" // Height 2: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ "bge 56b\n"
+ "cbz x11, 60f\n"
+ "57:" // Height 2: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 58f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "tbz x11, #0, 59f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "b 59f\n"
+ "58:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "59:" // Height 2: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ "60:" // Height 2: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 50b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "cmp x15, #0x10\n"
+ "bge 69f\n"
+ "tbz x15, #3, 64f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "tbz x15, #2, 62f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "tbz x15, #1, 61f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "tbz x15, #0, 68f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "b 68f\n"
+ "61:" // Height 2: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 68f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "b 68f\n"
+ "62:" // Height 2: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 63f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "tbz x15, #0, 68f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "b 68f\n"
+ "63:" // Height 2: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 68f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "b 68f\n"
+ "64:" // Height 2: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 66f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "tbz x15, #1, 65f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "tbz x15, #0, 68f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "b 68f\n"
+ "65:" // Height 2: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 68f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "b 68f\n"
+ "66:" // Height 2: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 67f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "tbz x15, #0, 68f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "b 68f\n"
+ "67:" // Height 2: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "68:" // Height 2: Partial direct writeback: Done
+ "b 70f\n"
+ "69:" // Height 2: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "70:" // Height 2: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 38b\n"
+ "b 212f\n"
+ "71:" // Height 3
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 72f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "b 73f\n"
+ "72:" // Height 3: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "73:" // Height 3: Column loop
+ "tbz %x[flags], #0, 83f\n"
+ "cmp x15, #0x10\n"
+ "bge 82f\n"
+ "tbz x15, #3, 77f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "tbz x15, #2, 75f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "tbz x15, #1, 74f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "tbz x15, #0, 81f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "b 81f\n"
+ "74:" // Height 3: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x15, #0, 81f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "b 81f\n"
+ "75:" // Height 3: Partial accumulate: partial_2_8
+ "tbz x15, #1, 76f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x15, #0, 81f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "b 81f\n"
+ "76:" // Height 3: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x15, #0, 81f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "b 81f\n"
+ "77:" // Height 3: Partial accumulate: partial_4_0
+ "tbz x15, #2, 79f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "tbz x15, #1, 78f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "tbz x15, #0, 81f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "b 81f\n"
+ "78:" // Height 3: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x15, #0, 81f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "b 81f\n"
+ "79:" // Height 3: Partial accumulate: partial_2_0
+ "tbz x15, #1, 80f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x15, #0, 81f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "b 81f\n"
+ "80:" // Height 3: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "81:" // Height 3: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "b 84f\n"
+ "82:" // Height 3: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "b 84f\n"
+ "83:" // Height 3: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "84:" // Height 3: setup done
+ "mov x12, #0x0\n"
+ "85:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 86f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x12, 87f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "b 87f\n"
+ "86:" // Height 3: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "87:" // Height 3: input setup done
+ "cmp x11, #0x10\n"
+ "blt 90f\n"
+ "cmp x11, #0x20\n"
+ "blt 89f\n"
+ "88:" // Height 3: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ "bge 88b\n"
+ "89:" // Height 3: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ "90:" // Height 3: Multiply loop: Main loop skip
+ "cbz x11, 95f\n"
+ "cmp x11, #0x4\n"
+ "blt 92f\n"
+ "91:" // Height 3: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ "bge 91b\n"
+ "cbz x11, 95f\n"
+ "92:" // Height 3: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 93f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "tbz x11, #0, 94f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x26]\n"
+ "b 94f\n"
+ "93:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x26, #0x0]\n"
+ "94:" // Height 3: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ "95:" // Height 3: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 85b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "cmp x15, #0x10\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "bge 104f\n"
+ "tbz x15, #3, 99f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "tbz x15, #2, 97f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "tbz x15, #1, 96f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "tbz x15, #0, 103f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "b 103f\n"
+ "96:" // Height 3: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 103f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "b 103f\n"
+ "97:" // Height 3: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 98f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "tbz x15, #0, 103f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "b 103f\n"
+ "98:" // Height 3: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 103f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "b 103f\n"
+ "99:" // Height 3: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 101f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "tbz x15, #1, 100f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "tbz x15, #0, 103f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "b 103f\n"
+ "100:" // Height 3: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 103f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "b 103f\n"
+ "101:" // Height 3: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 102f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "tbz x15, #0, 103f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "b 103f\n"
+ "102:" // Height 3: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "103:" // Height 3: Partial direct writeback: Done
+ "b 105f\n"
+ "104:" // Height 3: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "105:" // Height 3: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 73b\n"
+ "b 212f\n"
+ "106:" // Height 4
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 107f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 108f\n"
+ "107:" // Height 4: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "108:" // Height 4: Column loop
+ "tbz %x[flags], #0, 118f\n"
+ "cmp x15, #0x10\n"
+ "bge 117f\n"
+ "tbz x15, #3, 112f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "ld1 { v21.4s }, [x25], #0x10\n"
+ "tbz x15, #2, 110f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "ld1 { v22.4s }, [x25], #0x10\n"
+ "tbz x15, #1, 109f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "tbz x15, #0, 116f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "ld1 { v23.s }[2], [x25]\n"
+ "b 116f\n"
+ "109:" // Height 4: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x15, #0, 116f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "ldr s23, [x25, #0x0]\n"
+ "b 116f\n"
+ "110:" // Height 4: Partial accumulate: partial_2_8
+ "tbz x15, #1, 111f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x15, #0, 116f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "b 116f\n"
+ "111:" // Height 4: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x15, #0, 116f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "b 116f\n"
+ "112:" // Height 4: Partial accumulate: partial_4_0
+ "tbz x15, #2, 114f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "tbz x15, #1, 113f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "tbz x15, #0, 116f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "ld1 { v21.s }[2], [x25]\n"
+ "b 116f\n"
+ "113:" // Height 4: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x15, #0, 116f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "ldr s21, [x25, #0x0]\n"
+ "b 116f\n"
+ "114:" // Height 4: Partial accumulate: partial_2_0
+ "tbz x15, #1, 115f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x15, #0, 116f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "ld1 { v20.s }[2], [x25]\n"
+ "b 116f\n"
+ "115:" // Height 4: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "ldr s20, [x25, #0x0]\n"
+ "116:" // Height 4: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "b 119f\n"
+ "117:" // Height 4: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "b 119f\n"
+ "118:" // Height 4: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "119:" // Height 4: setup done
+ "mov x12, #0x0\n"
+ "120:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 121f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x12, 122f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "b 122f\n"
+ "121:" // Height 4: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "122:" // Height 4: input setup done
+ "cmp x11, #0x10\n"
+ "blt 125f\n"
+ "cmp x11, #0x20\n"
+ "blt 124f\n"
+ "123:" // Height 4: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ "bge 123b\n"
+ "124:" // Height 4: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x26, x26, #0x10\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ "125:" // Height 4: Multiply loop: Main loop skip
+ "cbz x11, 130f\n"
+ "cmp x11, #0x4\n"
+ "blt 127f\n"
+ "126:" // Height 4: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ "bge 126b\n"
+ "cbz x11, 130f\n"
+ "127:" // Height 4: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 128f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "tbz x11, #0, 129f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "b 129f\n"
+ "128:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x26, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "129:" // Height 4: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ "130:" // Height 4: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 120b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "cmp x15, #0x10\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "bge 139f\n"
+ "tbz x15, #3, 134f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "tbz x15, #2, 132f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "st1 { v22.4s }, [x25], #0x10\n"
+ "tbz x15, #1, 131f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "tbz x15, #0, 138f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "st1 { v23.s }[2], [x25]\n"
+ "b 138f\n"
+ "131:" // Height 4: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 138f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "str s23, [x25, #0x0]\n"
+ "b 138f\n"
+ "132:" // Height 4: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 133f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "tbz x15, #0, 138f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "b 138f\n"
+ "133:" // Height 4: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 138f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "b 138f\n"
+ "134:" // Height 4: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 136f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "tbz x15, #1, 135f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "tbz x15, #0, 138f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "b 138f\n"
+ "135:" // Height 4: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 138f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "b 138f\n"
+ "136:" // Height 4: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 137f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "tbz x15, #0, 138f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "b 138f\n"
+ "137:" // Height 4: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "138:" // Height 4: Partial direct writeback: Done
+ "b 140f\n"
+ "139:" // Height 4: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "140:" // Height 4: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 108b\n"
+ "b 212f\n"
+ "141:" // Height 5
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 142f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 143f\n"
+ "142:" // Height 5: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "143:" // Height 5: Column loop
+ "tbz %x[flags], #0, 153f\n"
+ "cmp x15, #0x10\n"
+ "bge 152f\n"
+ "tbz x15, #3, 147f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "ld1 { v21.4s }, [x25], #0x10\n"
+ "ld1 { v25.4s }, [x23], #0x10\n"
+ "tbz x15, #2, 145f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "ld1 { v22.4s }, [x25], #0x10\n"
+ "ld1 { v26.4s }, [x23], #0x10\n"
+ "tbz x15, #1, 144f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "tbz x15, #0, 151f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "ld1 { v23.s }[2], [x25]\n"
+ "ld1 { v27.s }[2], [x23]\n"
+ "b 151f\n"
+ "144:" // Height 5: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x15, #0, 151f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "ldr s23, [x25, #0x0]\n"
+ "ldr s27, [x23, #0x0]\n"
+ "b 151f\n"
+ "145:" // Height 5: Partial accumulate: partial_2_8
+ "tbz x15, #1, 146f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x15, #0, 151f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "ld1 { v26.s }[2], [x23]\n"
+ "b 151f\n"
+ "146:" // Height 5: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x15, #0, 151f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "ldr s26, [x23, #0x0]\n"
+ "b 151f\n"
+ "147:" // Height 5: Partial accumulate: partial_4_0
+ "tbz x15, #2, 149f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "tbz x15, #1, 148f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "tbz x15, #0, 151f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "ld1 { v21.s }[2], [x25]\n"
+ "ld1 { v25.s }[2], [x23]\n"
+ "b 151f\n"
+ "148:" // Height 5: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x15, #0, 151f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "ldr s21, [x25, #0x0]\n"
+ "ldr s25, [x23, #0x0]\n"
+ "b 151f\n"
+ "149:" // Height 5: Partial accumulate: partial_2_0
+ "tbz x15, #1, 150f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x15, #0, 151f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "ld1 { v20.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "b 151f\n"
+ "150:" // Height 5: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "ldr s20, [x25, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "151:" // Height 5: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "sub x23, x23, x19\n"
+ "b 154f\n"
+ "152:" // Height 5: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "ldr q24, [x23, #0x0]\n"
+ "ldr q25, [x23, #0x10]\n"
+ "ldr q26, [x23, #0x20]\n"
+ "ldr q27, [x23, #0x30]\n"
+ "b 154f\n"
+ "153:" // Height 5: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "154:" // Height 5: setup done
+ "mov x12, #0x0\n"
+ "155:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 156f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x12, 157f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "b 157f\n"
+ "156:" // Height 5: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "157:" // Height 5: input setup done
+ "cmp x11, #0x10\n"
+ "blt 160f\n"
+ "cmp x11, #0x20\n"
+ "blt 159f\n"
+ "158:" // Height 5: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
+ "bge 158b\n"
+ "159:" // Height 5: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x24, x24, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
+ "160:" // Height 5: Multiply loop: Main loop skip
+ "cbz x11, 165f\n"
+ "cmp x11, #0x4\n"
+ "blt 162f\n"
+ "161:" // Height 5: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ "bge 161b\n"
+ "cbz x11, 165f\n"
+ "162:" // Height 5: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 163f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "tbz x11, #0, 164f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "b 164f\n"
+ "163:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x26, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "164:" // Height 5: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ "165:" // Height 5: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 155b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "cmp x15, #0x10\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "bge 174f\n"
+ "tbz x15, #3, 169f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v25.4s }, [x23], #0x10\n"
+ "tbz x15, #2, 167f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "st1 { v22.4s }, [x25], #0x10\n"
+ "st1 { v26.4s }, [x23], #0x10\n"
+ "tbz x15, #1, 166f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "str d27, [x23], #0x8\n"
+ "tbz x15, #0, 173f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "st1 { v23.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x23]\n"
+ "b 173f\n"
+ "166:" // Height 5: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 173f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "str s23, [x25, #0x0]\n"
+ "str s27, [x23, #0x0]\n"
+ "b 173f\n"
+ "167:" // Height 5: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 168f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "str d26, [x23], #0x8\n"
+ "tbz x15, #0, 173f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "st1 { v26.s }[2], [x23]\n"
+ "b 173f\n"
+ "168:" // Height 5: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 173f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "str s26, [x23, #0x0]\n"
+ "b 173f\n"
+ "169:" // Height 5: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 171f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "tbz x15, #1, 170f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "str d25, [x23], #0x8\n"
+ "tbz x15, #0, 173f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "st1 { v25.s }[2], [x23]\n"
+ "b 173f\n"
+ "170:" // Height 5: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 173f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "str s25, [x23, #0x0]\n"
+ "b 173f\n"
+ "171:" // Height 5: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 172f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "tbz x15, #0, 173f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "b 173f\n"
+ "172:" // Height 5: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "str s24, [x23, #0x0]\n"
+ "173:" // Height 5: Partial direct writeback: Done
+ "b 175f\n"
+ "174:" // Height 5: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q25, [x23, #0x10]\n"
+ "str q26, [x23, #0x20]\n"
+ "str q27, [x23, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "add x23, x23, #0x40\n"
+ "175:" // Height 5: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 143b\n"
+ "b 212f\n"
+ "176:" // Height 6
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 177f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "ldr x21, [%x[output_ptr], #0x28]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 178f\n"
+ "177:" // Height 6: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "add x21, x23, x19, LSL #2\n"
+ "add %x[output_ptr], x21, x19, LSL #2\n"
+ "178:" // Height 6: Column loop
+ "tbz %x[flags], #0, 188f\n"
+ "cmp x15, #0x10\n"
+ "bge 187f\n"
+ "tbz x15, #3, 182f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "ld1 { v9.4s }, [x13], #0x10\n"
+ "ld1 { v13.4s }, [x9], #0x10\n"
+ "ld1 { v17.4s }, [x27], #0x10\n"
+ "ld1 { v21.4s }, [x25], #0x10\n"
+ "ld1 { v25.4s }, [x23], #0x10\n"
+ "ld1 { v29.4s }, [x21], #0x10\n"
+ "tbz x15, #2, 180f\n"
+ "ld1 { v10.4s }, [x13], #0x10\n"
+ "ld1 { v14.4s }, [x9], #0x10\n"
+ "ld1 { v18.4s }, [x27], #0x10\n"
+ "ld1 { v22.4s }, [x25], #0x10\n"
+ "ld1 { v26.4s }, [x23], #0x10\n"
+ "ld1 { v30.4s }, [x21], #0x10\n"
+ "tbz x15, #1, 179f\n"
+ "mov x19, #0x38\n"
+ "ldr d11, [x13], #0x8\n"
+ "ldr d15, [x9], #0x8\n"
+ "ldr d19, [x27], #0x8\n"
+ "ldr d23, [x25], #0x8\n"
+ "ldr d27, [x23], #0x8\n"
+ "ldr d31, [x21], #0x8\n"
+ "tbz x15, #0, 186f\n"
+ "ld1 { v11.s }[2], [x13]\n"
+ "ld1 { v15.s }[2], [x9]\n"
+ "ld1 { v19.s }[2], [x27]\n"
+ "ld1 { v23.s }[2], [x25]\n"
+ "ld1 { v27.s }[2], [x23]\n"
+ "ld1 { v31.s }[2], [x21]\n"
+ "b 186f\n"
+ "179:" // Height 6: Partial accumulate: partial_1_12
+ "mov x19, #0x30\n"
+ "tbz x15, #0, 186f\n"
+ "ldr s11, [x13, #0x0]\n"
+ "ldr s15, [x9, #0x0]\n"
+ "ldr s19, [x27, #0x0]\n"
+ "ldr s23, [x25, #0x0]\n"
+ "ldr s27, [x23, #0x0]\n"
+ "ldr s31, [x21, #0x0]\n"
+ "b 186f\n"
+ "180:" // Height 6: Partial accumulate: partial_2_8
+ "tbz x15, #1, 181f\n"
+ "ldr d10, [x13], #0x8\n"
+ "ldr d14, [x9], #0x8\n"
+ "ldr d18, [x27], #0x8\n"
+ "ldr d22, [x25], #0x8\n"
+ "ldr d26, [x23], #0x8\n"
+ "ldr d30, [x21], #0x8\n"
+ "mov x19, #0x28\n"
+ "tbz x15, #0, 186f\n"
+ "ld1 { v10.s }[2], [x13]\n"
+ "ld1 { v14.s }[2], [x9]\n"
+ "ld1 { v18.s }[2], [x27]\n"
+ "ld1 { v22.s }[2], [x25]\n"
+ "ld1 { v26.s }[2], [x23]\n"
+ "ld1 { v30.s }[2], [x21]\n"
+ "b 186f\n"
+ "181:" // Height 6: Partial accumulate: partial_1_8
+ "mov x19, #0x20\n"
+ "tbz x15, #0, 186f\n"
+ "ldr s10, [x13, #0x0]\n"
+ "ldr s14, [x9, #0x0]\n"
+ "ldr s18, [x27, #0x0]\n"
+ "ldr s22, [x25, #0x0]\n"
+ "ldr s26, [x23, #0x0]\n"
+ "ldr s30, [x21, #0x0]\n"
+ "b 186f\n"
+ "182:" // Height 6: Partial accumulate: partial_4_0
+ "tbz x15, #2, 184f\n"
+ "ld1 { v8.4s }, [x13], #0x10\n"
+ "ld1 { v12.4s }, [x9], #0x10\n"
+ "ld1 { v16.4s }, [x27], #0x10\n"
+ "ld1 { v20.4s }, [x25], #0x10\n"
+ "ld1 { v24.4s }, [x23], #0x10\n"
+ "ld1 { v28.4s }, [x21], #0x10\n"
+ "tbz x15, #1, 183f\n"
+ "mov x19, #0x18\n"
+ "ldr d9, [x13], #0x8\n"
+ "ldr d13, [x9], #0x8\n"
+ "ldr d17, [x27], #0x8\n"
+ "ldr d21, [x25], #0x8\n"
+ "ldr d25, [x23], #0x8\n"
+ "ldr d29, [x21], #0x8\n"
+ "tbz x15, #0, 186f\n"
+ "ld1 { v9.s }[2], [x13]\n"
+ "ld1 { v13.s }[2], [x9]\n"
+ "ld1 { v17.s }[2], [x27]\n"
+ "ld1 { v21.s }[2], [x25]\n"
+ "ld1 { v25.s }[2], [x23]\n"
+ "ld1 { v29.s }[2], [x21]\n"
+ "b 186f\n"
+ "183:" // Height 6: Partial accumulate: partial_1_4
+ "mov x19, #0x10\n"
+ "tbz x15, #0, 186f\n"
+ "ldr s9, [x13, #0x0]\n"
+ "ldr s13, [x9, #0x0]\n"
+ "ldr s17, [x27, #0x0]\n"
+ "ldr s21, [x25, #0x0]\n"
+ "ldr s25, [x23, #0x0]\n"
+ "ldr s29, [x21, #0x0]\n"
+ "b 186f\n"
+ "184:" // Height 6: Partial accumulate: partial_2_0
+ "tbz x15, #1, 185f\n"
+ "ldr d8, [x13], #0x8\n"
+ "ldr d12, [x9], #0x8\n"
+ "ldr d16, [x27], #0x8\n"
+ "ldr d20, [x25], #0x8\n"
+ "ldr d24, [x23], #0x8\n"
+ "ldr d28, [x21], #0x8\n"
+ "mov x19, #0x8\n"
+ "tbz x15, #0, 186f\n"
+ "ld1 { v8.s }[2], [x13]\n"
+ "ld1 { v12.s }[2], [x9]\n"
+ "ld1 { v16.s }[2], [x27]\n"
+ "ld1 { v20.s }[2], [x25]\n"
+ "ld1 { v24.s }[2], [x23]\n"
+ "ld1 { v28.s }[2], [x21]\n"
+ "b 186f\n"
+ "185:" // Height 6: Partial accumulate: partial_1_0
+ "mov x19, #0x0\n"
+ "ldr s8, [x13, #0x0]\n"
+ "ldr s12, [x9, #0x0]\n"
+ "ldr s16, [x27, #0x0]\n"
+ "ldr s20, [x25, #0x0]\n"
+ "ldr s24, [x23, #0x0]\n"
+ "ldr s28, [x21, #0x0]\n"
+ "186:" // Height 6: Partial accumulate: Done
+ "sub x13, x13, x19\n"
+ "sub x9, x9, x19\n"
+ "sub x27, x27, x19\n"
+ "sub x25, x25, x19\n"
+ "sub x23, x23, x19\n"
+ "sub x21, x21, x19\n"
+ "b 189f\n"
+ "187:" // Height 6: full accumulate
+ "ldr q8, [x13, #0x0]\n"
+ "ldr q9, [x13, #0x10]\n"
+ "ldr q10, [x13, #0x20]\n"
+ "ldr q11, [x13, #0x30]\n"
+ "ldr q12, [x9, #0x0]\n"
+ "ldr q13, [x9, #0x10]\n"
+ "ldr q14, [x9, #0x20]\n"
+ "ldr q15, [x9, #0x30]\n"
+ "ldr q16, [x27, #0x0]\n"
+ "ldr q17, [x27, #0x10]\n"
+ "ldr q18, [x27, #0x20]\n"
+ "ldr q19, [x27, #0x30]\n"
+ "ldr q20, [x25, #0x0]\n"
+ "ldr q21, [x25, #0x10]\n"
+ "ldr q22, [x25, #0x20]\n"
+ "ldr q23, [x25, #0x30]\n"
+ "ldr q24, [x23, #0x0]\n"
+ "ldr q25, [x23, #0x10]\n"
+ "ldr q26, [x23, #0x20]\n"
+ "ldr q27, [x23, #0x30]\n"
+ "ldr q28, [x21, #0x0]\n"
+ "ldr q29, [x21, #0x10]\n"
+ "ldr q30, [x21, #0x20]\n"
+ "ldr q31, [x21, #0x30]\n"
+ "b 189f\n"
+ "188:" // Height 6: no accumulate
+ "movi v8.4s, #0x0\n"
+ "movi v9.4s, #0x0\n"
+ "movi v10.4s, #0x0\n"
+ "movi v11.4s, #0x0\n"
+ "movi v12.4s, #0x0\n"
+ "movi v13.4s, #0x0\n"
+ "movi v14.4s, #0x0\n"
+ "movi v15.4s, #0x0\n"
+ "movi v16.4s, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "movi v18.4s, #0x0\n"
+ "movi v19.4s, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v21.4s, #0x0\n"
+ "movi v22.4s, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v24.4s, #0x0\n"
+ "movi v25.4s, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v27.4s, #0x0\n"
+ "movi v28.4s, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v30.4s, #0x0\n"
+ "movi v31.4s, #0x0\n"
+ "189:" // Height 6: setup done
+ "mov x12, #0x0\n"
+ "190:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 191f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x12, 192f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "add x20, x20, x19\n"
+ "b 192f\n"
+ "191:" // Height 6: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "add x20, x22, x19\n"
+ "192:" // Height 6: input setup done
+ "cmp x11, #0x10\n"
+ "blt 195f\n"
+ "cmp x11, #0x20\n"
+ "blt 194f\n"
+ "193:" // Height 6: Multiply loop: Main loop head
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "sub x11, x11, #0x10\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ "cmp x11, #0x20\n"
+ ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e8de // udot v30.4s, v6.16b, v5.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n"
+ "bge 193b\n"
+ "194:" // Height 6: Multiply loop: Single iteration only
+ "sub x11, x11, #0x10\n"
+ "ldr q0, [x10, #0x0]\n"
+ "ldr q1, [x28, #0x0]\n"
+ "ldr q2, [x26, #0x0]\n"
+ "ldr q3, [x24, #0x0]\n"
+ "ldr q4, [x22, #0x0]\n"
+ "ldr q5, [x20, #0x0]\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "add x10, x10, #0x10\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ "add x22, x22, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x14, #0x40]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x50]\n"
+ ".inst 0x6fa0e0c8 // udot v8.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0cc // udot v12.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d0 // udot v16.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d4 // udot v20.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0d8 // udot v24.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e0dc // udot v28.4s, v6.16b, v5.4b[1]\n"
+ "ldr q6, [x14, #0x60]\n"
+ ".inst 0x6fa0e0e9 // udot v9.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ed // udot v13.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f1 // udot v17.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f5 // udot v21.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0f9 // udot v25.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e0fd // udot v29.4s, v7.16b, v5.4b[1]\n"
+ "ldr q7, [x14, #0x70]\n"
+ ".inst 0x6fa0e0ca // udot v10.4s, v6.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ce // udot v14.4s, v6.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0d2 // udot v18.4s, v6.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0d6 // udot v22.4s, v6.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0da // udot v26.4s, v6.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e0de // udot v30.4s, v6.16b, v5.4b[1]\n"
+ "ldr q6, [x14, #0x80]\n"
+ ".inst 0x6fa0e0eb // udot v11.4s, v7.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e0ef // udot v15.4s, v7.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e0f3 // udot v19.4s, v7.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e0f7 // udot v23.4s, v7.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e0fb // udot v27.4s, v7.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e0ff // udot v31.4s, v7.16b, v5.4b[1]\n"
+ "ldr q7, [x14, #0x90]\n"
+ ".inst 0x6f80e8c8 // udot v8.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8cc // udot v12.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d0 // udot v16.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d4 // udot v20.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8d8 // udot v24.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x6f85e8dc // udot v28.4s, v6.16b, v5.4b[2]\n"
+ "ldr q6, [x14, #0xa0]\n"
+ ".inst 0x6f80e8e9 // udot v9.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ed // udot v13.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f1 // udot v17.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f5 // udot v21.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8f9 // udot v25.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x6f85e8fd // udot v29.4s, v7.16b, v5.4b[2]\n"
+ "ldr q7, [x14, #0xb0]\n"
+ ".inst 0x6f80e8ca // udot v10.4s, v6.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ce // udot v14.4s, v6.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8d2 // udot v18.4s, v6.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8d6 // udot v22.4s, v6.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8da // udot v26.4s, v6.16b, v4.4b[2]\n"
+ ".inst 0x6f85e8de // udot v30.4s, v6.16b, v5.4b[2]\n"
+ "ldr q6, [x14, #0xc0]\n"
+ ".inst 0x6f80e8eb // udot v11.4s, v7.16b, v0.4b[2]\n"
+ ".inst 0x6f81e8ef // udot v15.4s, v7.16b, v1.4b[2]\n"
+ ".inst 0x6f82e8f3 // udot v19.4s, v7.16b, v2.4b[2]\n"
+ ".inst 0x6f83e8f7 // udot v23.4s, v7.16b, v3.4b[2]\n"
+ ".inst 0x6f84e8fb // udot v27.4s, v7.16b, v4.4b[2]\n"
+ ".inst 0x6f85e8ff // udot v31.4s, v7.16b, v5.4b[2]\n"
+ "ldr q7, [x14, #0xd0]\n"
+ ".inst 0x6fa0e8c8 // udot v8.4s, v6.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8cc // udot v12.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d0 // udot v16.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d4 // udot v20.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8d8 // udot v24.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e8dc // udot v28.4s, v6.16b, v5.4b[3]\n"
+ "ldr q6, [x14, #0xe0]\n"
+ ".inst 0x6fa0e8e9 // udot v9.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ed // udot v13.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f1 // udot v17.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f5 // udot v21.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8f9 // udot v25.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e8fd // udot v29.4s, v7.16b, v5.4b[3]\n"
+ "ldr q7, [x14, #0xf0]\n"
+ ".inst 0x6fa0e8ca // udot v10.4s, v6.16b, v0.4b[3]\n"
+ "add x14, x14, #0x100\n"
+ ".inst 0x6fa1e8ce // udot v14.4s, v6.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8d2 // udot v18.4s, v6.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8d6 // udot v22.4s, v6.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8da // udot v26.4s, v6.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e8de // udot v30.4s, v6.16b, v5.4b[3]\n"
+ ".inst 0x6fa0e8eb // udot v11.4s, v7.16b, v0.4b[3]\n"
+ ".inst 0x6fa1e8ef // udot v15.4s, v7.16b, v1.4b[3]\n"
+ ".inst 0x6fa2e8f3 // udot v19.4s, v7.16b, v2.4b[3]\n"
+ ".inst 0x6fa3e8f7 // udot v23.4s, v7.16b, v3.4b[3]\n"
+ ".inst 0x6fa4e8fb // udot v27.4s, v7.16b, v4.4b[3]\n"
+ ".inst 0x6fa5e8ff // udot v31.4s, v7.16b, v5.4b[3]\n"
+ "195:" // Height 6: Multiply loop: Main loop skip
+ "cbz x11, 200f\n"
+ "cmp x11, #0x4\n"
+ "blt 197f\n"
+ "196:" // Height 6: Multiply loop: Odd block loop
+ "ldr s0, [x10], #0x4\n"
+ "ldr s1, [x28], #0x4\n"
+ "ldr s2, [x26], #0x4\n"
+ "ldr s3, [x24], #0x4\n"
+ "ldr s4, [x22], #0x4\n"
+ "ldr s5, [x20], #0x4\n"
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ "sub x11, x11, #0x4\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ "cmp x11, #0x4\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
+ "bge 196b\n"
+ "cbz x11, 200f\n"
+ "197:" // Height 6: Multiply loop: Skip odd blocks
+ "tbz x11, #1, 198f\n"
+ "ldr h0, [x10], #0x2\n"
+ "ldr h1, [x28], #0x2\n"
+ "ldr h2, [x26], #0x2\n"
+ "ldr h3, [x24], #0x2\n"
+ "ldr h4, [x22], #0x2\n"
+ "ldr h5, [x20], #0x2\n"
+ "tbz x11, #0, 199f\n"
+ "ld1 { v0.b }[2], [x10]\n"
+ "ld1 { v1.b }[2], [x28]\n"
+ "ld1 { v2.b }[2], [x26]\n"
+ "ld1 { v3.b }[2], [x24]\n"
+ "ld1 { v4.b }[2], [x22]\n"
+ "ld1 { v5.b }[2], [x20]\n"
+ "b 199f\n"
+ "198:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b0, [x10, #0x0]\n"
+ "ldr b1, [x28, #0x0]\n"
+ "ldr b2, [x26, #0x0]\n"
+ "ldr b3, [x24, #0x0]\n"
+ "ldr b4, [x22, #0x0]\n"
+ "ldr b5, [x20, #0x0]\n"
+ "199:" // Height 6: Multiply loop: Ragged operand read: Done
+ "ldr q6, [x14, #0x0]\n"
+ ".inst 0x6f80e0c8 // udot v8.4s, v6.16b, v0.4b[0]\n"
+ "ldr q7, [x14, #0x10]\n"
+ ".inst 0x6f81e0cc // udot v12.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d0 // udot v16.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d4 // udot v20.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0d8 // udot v24.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0dc // udot v28.4s, v6.16b, v5.4b[0]\n"
+ "ldr q6, [x14, #0x20]\n"
+ ".inst 0x6f80e0e9 // udot v9.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ed // udot v13.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f1 // udot v17.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f5 // udot v21.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0f9 // udot v25.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0fd // udot v29.4s, v7.16b, v5.4b[0]\n"
+ "ldr q7, [x14, #0x30]\n"
+ ".inst 0x6f80e0ca // udot v10.4s, v6.16b, v0.4b[0]\n"
+ "add x14, x14, #0x40\n"
+ ".inst 0x6f81e0ce // udot v14.4s, v6.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0d2 // udot v18.4s, v6.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0d6 // udot v22.4s, v6.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0da // udot v26.4s, v6.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0de // udot v30.4s, v6.16b, v5.4b[0]\n"
+ ".inst 0x6f80e0eb // udot v11.4s, v7.16b, v0.4b[0]\n"
+ ".inst 0x6f81e0ef // udot v15.4s, v7.16b, v1.4b[0]\n"
+ ".inst 0x6f82e0f3 // udot v19.4s, v7.16b, v2.4b[0]\n"
+ ".inst 0x6f83e0f7 // udot v23.4s, v7.16b, v3.4b[0]\n"
+ ".inst 0x6f84e0fb // udot v27.4s, v7.16b, v4.4b[0]\n"
+ ".inst 0x6f85e0ff // udot v31.4s, v7.16b, v5.4b[0]\n"
+ "200:" // Height 6: Multiply loop: No odd multiplies
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "add x12, x12, #0x1\n"
+ "cmp x12, x19\n"
+ "bne 190b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "cmp x15, #0x10\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "bge 209f\n"
+ "tbz x15, #3, 204f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v9.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v13.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v17.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v21.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v25.4s }, [x23], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "st1 { v29.4s }, [x21], #0x10\n"
+ "tbz x15, #2, 202f\n"
+ "st1 { v10.4s }, [x13], #0x10\n"
+ "st1 { v14.4s }, [x9], #0x10\n"
+ "st1 { v18.4s }, [x27], #0x10\n"
+ "st1 { v22.4s }, [x25], #0x10\n"
+ "st1 { v26.4s }, [x23], #0x10\n"
+ "st1 { v30.4s }, [x21], #0x10\n"
+ "tbz x15, #1, 201f\n"
+ "str d11, [x13], #0x8\n"
+ "str d15, [x9], #0x8\n"
+ "str d19, [x27], #0x8\n"
+ "str d23, [x25], #0x8\n"
+ "str d27, [x23], #0x8\n"
+ "str d31, [x21], #0x8\n"
+ "tbz x15, #0, 208f\n"
+ "st1 { v11.s }[2], [x13]\n"
+ "st1 { v15.s }[2], [x9]\n"
+ "st1 { v19.s }[2], [x27]\n"
+ "st1 { v23.s }[2], [x25]\n"
+ "st1 { v27.s }[2], [x23]\n"
+ "st1 { v31.s }[2], [x21]\n"
+ "b 208f\n"
+ "201:" // Height 6: Partial direct writeback: partial_1_12
+ "tbz x15, #0, 208f\n"
+ "str s11, [x13, #0x0]\n"
+ "str s15, [x9, #0x0]\n"
+ "str s19, [x27, #0x0]\n"
+ "str s23, [x25, #0x0]\n"
+ "str s27, [x23, #0x0]\n"
+ "str s31, [x21, #0x0]\n"
+ "b 208f\n"
+ "202:" // Height 6: Partial direct writeback: partial_2_8
+ "tbz x15, #1, 203f\n"
+ "str d10, [x13], #0x8\n"
+ "str d14, [x9], #0x8\n"
+ "str d18, [x27], #0x8\n"
+ "str d22, [x25], #0x8\n"
+ "str d26, [x23], #0x8\n"
+ "str d30, [x21], #0x8\n"
+ "tbz x15, #0, 208f\n"
+ "st1 { v10.s }[2], [x13]\n"
+ "st1 { v14.s }[2], [x9]\n"
+ "st1 { v18.s }[2], [x27]\n"
+ "st1 { v22.s }[2], [x25]\n"
+ "st1 { v26.s }[2], [x23]\n"
+ "st1 { v30.s }[2], [x21]\n"
+ "b 208f\n"
+ "203:" // Height 6: Partial direct writeback: partial_1_8
+ "tbz x15, #0, 208f\n"
+ "str s10, [x13, #0x0]\n"
+ "str s14, [x9, #0x0]\n"
+ "str s18, [x27, #0x0]\n"
+ "str s22, [x25, #0x0]\n"
+ "str s26, [x23, #0x0]\n"
+ "str s30, [x21, #0x0]\n"
+ "b 208f\n"
+ "204:" // Height 6: Partial direct writeback: partial_4_0
+ "tbz x15, #2, 206f\n"
+ "st1 { v8.4s }, [x13], #0x10\n"
+ "st1 { v12.4s }, [x9], #0x10\n"
+ "st1 { v16.4s }, [x27], #0x10\n"
+ "st1 { v20.4s }, [x25], #0x10\n"
+ "st1 { v24.4s }, [x23], #0x10\n"
+ "st1 { v28.4s }, [x21], #0x10\n"
+ "tbz x15, #1, 205f\n"
+ "str d9, [x13], #0x8\n"
+ "str d13, [x9], #0x8\n"
+ "str d17, [x27], #0x8\n"
+ "str d21, [x25], #0x8\n"
+ "str d25, [x23], #0x8\n"
+ "str d29, [x21], #0x8\n"
+ "tbz x15, #0, 208f\n"
+ "st1 { v9.s }[2], [x13]\n"
+ "st1 { v13.s }[2], [x9]\n"
+ "st1 { v17.s }[2], [x27]\n"
+ "st1 { v21.s }[2], [x25]\n"
+ "st1 { v25.s }[2], [x23]\n"
+ "st1 { v29.s }[2], [x21]\n"
+ "b 208f\n"
+ "205:" // Height 6: Partial direct writeback: partial_1_4
+ "tbz x15, #0, 208f\n"
+ "str s9, [x13, #0x0]\n"
+ "str s13, [x9, #0x0]\n"
+ "str s17, [x27, #0x0]\n"
+ "str s21, [x25, #0x0]\n"
+ "str s25, [x23, #0x0]\n"
+ "str s29, [x21, #0x0]\n"
+ "b 208f\n"
+ "206:" // Height 6: Partial direct writeback: partial_2_0
+ "tbz x15, #1, 207f\n"
+ "str d8, [x13], #0x8\n"
+ "str d12, [x9], #0x8\n"
+ "str d16, [x27], #0x8\n"
+ "str d20, [x25], #0x8\n"
+ "str d24, [x23], #0x8\n"
+ "str d28, [x21], #0x8\n"
+ "tbz x15, #0, 208f\n"
+ "st1 { v8.s }[2], [x13]\n"
+ "st1 { v12.s }[2], [x9]\n"
+ "st1 { v16.s }[2], [x27]\n"
+ "st1 { v20.s }[2], [x25]\n"
+ "st1 { v24.s }[2], [x23]\n"
+ "st1 { v28.s }[2], [x21]\n"
+ "b 208f\n"
+ "207:" // Height 6: Partial direct writeback: partial_1_0
+ "str s8, [x13, #0x0]\n"
+ "str s12, [x9, #0x0]\n"
+ "str s16, [x27, #0x0]\n"
+ "str s20, [x25, #0x0]\n"
+ "str s24, [x23, #0x0]\n"
+ "str s28, [x21, #0x0]\n"
+ "208:" // Height 6: Partial direct writeback: Done
+ "b 210f\n"
+ "209:" // Height 6: Full writeback
+ "str q8, [x13, #0x0]\n"
+ "str q9, [x13, #0x10]\n"
+ "str q10, [x13, #0x20]\n"
+ "str q11, [x13, #0x30]\n"
+ "str q12, [x9, #0x0]\n"
+ "str q13, [x9, #0x10]\n"
+ "str q14, [x9, #0x20]\n"
+ "str q15, [x9, #0x30]\n"
+ "str q16, [x27, #0x0]\n"
+ "str q17, [x27, #0x10]\n"
+ "str q18, [x27, #0x20]\n"
+ "str q19, [x27, #0x30]\n"
+ "str q20, [x25, #0x0]\n"
+ "str q21, [x25, #0x10]\n"
+ "str q22, [x25, #0x20]\n"
+ "str q23, [x25, #0x30]\n"
+ "str q24, [x23, #0x0]\n"
+ "str q25, [x23, #0x10]\n"
+ "str q26, [x23, #0x20]\n"
+ "str q27, [x23, #0x30]\n"
+ "str q28, [x21, #0x0]\n"
+ "str q29, [x21, #0x10]\n"
+ "str q30, [x21, #0x20]\n"
+ "str q31, [x21, #0x30]\n"
+ "add x13, x13, #0x40\n"
+ "add x9, x9, #0x40\n"
+ "add x27, x27, #0x40\n"
+ "add x25, x25, #0x40\n"
+ "add x23, x23, #0x40\n"
+ "add x21, x21, #0x40\n"
+ "210:" // Height 6: Writeback done
+ "subs x15, x15, #0x10\n"
+ "bgt 178b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 212f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 211f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "211:" // Update direct input
+ "mov x19, #0x6\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "212:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp
deleted file mode 100644
index 58a51432fd..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/x1.cpp
+++ /dev/null
@@ -1,328 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __aarch64__
-
-#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
-
-namespace arm_gemm {
-
-void a64_interleaved_bf16fp32_dot_12x8_x1(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
- const bfloat16 *a_ptr = Apanel;
- float *c_ptr = Cpanel;
-
- K /= 2;
- const long loops_count = (K / 2) - 1;
- const long tails_count = K % 2;
-
- for (int yb=0; yb<ablocks; yb++) {
- const bfloat16 *a_ptr0 = a_ptr;
- const bfloat16 *b_ptr = Bpanel;
-
- for (int xb=0; xb<bblocks; xb++) {
- a_ptr = a_ptr0;
- long loops = loops_count;
- long tails = tails_count;
-
- __asm __volatile (
- "movi v8.4s, #0\n"
- "ldr q0, [%[a_ptr]]\n"
- "movi v9.4s, #0\n"
- "ldr q2, [%[b_ptr]]\n"
- "movi v10.4s, #0\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- "movi v11.4s, #0\n"
- "ldr q3, [%[b_ptr], #0x10]\n"
- "movi v12.4s, #0\n"
- "ldr q4, [%[b_ptr], #0x20]\n"
- "movi v13.4s, #0\n"
- "add %[a_ptr], %[a_ptr], #0x20\n"
- "movi v14.4s, #0\n"
- "add %[b_ptr], %[b_ptr], #0x30\n"
- "movi v15.4s, #0\n"
- "movi v16.4s, #0\n"
- "movi v17.4s, #0\n"
- "movi v18.4s, #0\n"
- "movi v19.4s, #0\n"
- "movi v20.4s, #0\n"
- "movi v21.4s, #0\n"
- "movi v22.4s, #0\n"
- "movi v23.4s, #0\n"
- "movi v24.4s, #0\n"
- "movi v25.4s, #0\n"
- "movi v26.4s, #0\n"
- "movi v27.4s, #0\n"
- "movi v28.4s, #0\n"
- "movi v29.4s, #0\n"
- "movi v30.4s, #0\n"
- "movi v31.4s, #0\n"
- "cbz %[loops], 1f\n"
- "2:\n"
- ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
- ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
- ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
- ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
- ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
- ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
- ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
- "ldr q2, [%[b_ptr]]\n"
- ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
- ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
- ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
- ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
- ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
- ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
- ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
- ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
- "ldr q3, [%[b_ptr], #0x10]\n"
- ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
- ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
- ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
- ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
- "ldr q0, [%[a_ptr]]\n"
- ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
- ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
- ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
- ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
- "ldr q4, [%[b_ptr], #0x20]\n"
- ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
- "add %[b_ptr], %[b_ptr], #0x60\n"
- ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
- ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
- ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
- ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
- ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
- "ldr q2, [%[b_ptr], #-0x30]\n"
- ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
- ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
- ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
- ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
- ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
- ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
- ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
- ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
- "ldr q3, [%[b_ptr], #-0x20]\n"
- ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
- ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
- ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
- ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
- "ldr q0, [%[a_ptr], #-0x20]\n"
- ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
- ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
- ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
- ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
- "ldr q4, [%[b_ptr], #-0x10]\n"
- "ldr q1, [%[a_ptr], #-0x10]\n"
- "b.ne 2b\n"
- "1:\n"
- "cbz %[tails], 3f\n"
- ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
- ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
- ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
- ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
- ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
- ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
- ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
- ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
- "ldr q2, [%[b_ptr]]\n"
- ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
- ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
- ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
- ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
- ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
- ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
- ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
- ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
- "ldr q3, [%[b_ptr], #0x10]\n"
- ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
- ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
- ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
- ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
- "ldr q0, [%[a_ptr]]\n"
- ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
- ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
- ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
- ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
- "ldr q4, [%[b_ptr], #0x20]\n"
- ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
- "ldr q1, [%[a_ptr], #0x10]\n"
- ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
- "add %[a_ptr], %[a_ptr], #0x40\n"
- ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
- "add %[b_ptr], %[b_ptr], #0x60\n"
- ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
- ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
- ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
- ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
- ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
- "ldr q2, [%[b_ptr], #-0x30]\n"
- ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
- ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
- ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
- ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
- ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
- ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
- ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
- ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
- "ldr q3, [%[b_ptr], #-0x20]\n"
- ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
- ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
- ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
- ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
- "ldr q0, [%[a_ptr], #-0x20]\n"
- ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
- ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
- ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
- ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
- "ldr q4, [%[b_ptr], #-0x10]\n"
- ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
- "ldr q1, [%[a_ptr], #-0x10]\n"
- ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
- ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
- ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
- "str q8, [%[c_ptr]]\n"
- ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
- ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
- ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
- ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
- ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
- ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
- ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
- ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
- "str q12, [%[c_ptr], #0x10]\n"
- ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
- ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
- ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
- ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
- ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
- ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
- ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
- ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
- "str q16, [%[c_ptr], #0x20]\n"
- ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
- ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
- ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
- "str q9, [%[c_ptr], #0x30]\n"
- ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
- "b 4f\n"
- "3:\n"
- ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
- "add %[a_ptr], %[a_ptr], #0x20\n"
- ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
- "add %[b_ptr], %[b_ptr], #0x30\n"
- ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
- ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
- ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
- ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
- ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
- ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
- "ldr q2, [%[b_ptr], #-0x30]\n"
- ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
- ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
- ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
- ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
- ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
- ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
- ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
- ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
- "ldr q3, [%[b_ptr], #-0x20]\n"
- ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
- ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
- ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
- ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
- "ldr q0, [%[a_ptr], #-0x20]\n"
- ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
- ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
- ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
- ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
- "ldr q4, [%[b_ptr], #-0x10]\n"
- ".inst 0x4f40f048 // bfdot v8.4s, v2.8h, v0.h[0]\n"
- "ldr q1, [%[a_ptr], #-0x10]\n"
- ".inst 0x4f60f049 // bfdot v9.4s, v2.8h, v0.h[1]\n"
- ".inst 0x4f40f84a // bfdot v10.4s, v2.8h, v0.h[2]\n"
- ".inst 0x4f60f84b // bfdot v11.4s, v2.8h, v0.h[3]\n"
- "str q8, [%[c_ptr]]\n"
- ".inst 0x4f41f054 // bfdot v20.4s, v2.8h, v1.h[0]\n"
- ".inst 0x4f61f055 // bfdot v21.4s, v2.8h, v1.h[1]\n"
- ".inst 0x4f41f856 // bfdot v22.4s, v2.8h, v1.h[2]\n"
- ".inst 0x4f61f857 // bfdot v23.4s, v2.8h, v1.h[3]\n"
- ".inst 0x4f40f06c // bfdot v12.4s, v3.8h, v0.h[0]\n"
- ".inst 0x4f60f06d // bfdot v13.4s, v3.8h, v0.h[1]\n"
- ".inst 0x4f40f86e // bfdot v14.4s, v3.8h, v0.h[2]\n"
- ".inst 0x4f60f86f // bfdot v15.4s, v3.8h, v0.h[3]\n"
- "str q12, [%[c_ptr], #0x10]\n"
- ".inst 0x4f41f078 // bfdot v24.4s, v3.8h, v1.h[0]\n"
- ".inst 0x4f61f079 // bfdot v25.4s, v3.8h, v1.h[1]\n"
- ".inst 0x4f41f87a // bfdot v26.4s, v3.8h, v1.h[2]\n"
- ".inst 0x4f61f87b // bfdot v27.4s, v3.8h, v1.h[3]\n"
- ".inst 0x4f40f090 // bfdot v16.4s, v4.8h, v0.h[0]\n"
- ".inst 0x4f60f091 // bfdot v17.4s, v4.8h, v0.h[1]\n"
- ".inst 0x4f40f892 // bfdot v18.4s, v4.8h, v0.h[2]\n"
- ".inst 0x4f60f893 // bfdot v19.4s, v4.8h, v0.h[3]\n"
- "str q16, [%[c_ptr], #0x20]\n"
- ".inst 0x4f41f09c // bfdot v28.4s, v4.8h, v1.h[0]\n"
- ".inst 0x4f61f09d // bfdot v29.4s, v4.8h, v1.h[1]\n"
- ".inst 0x4f41f89e // bfdot v30.4s, v4.8h, v1.h[2]\n"
- "str q9, [%[c_ptr], #0x30]\n"
- ".inst 0x4f61f89f // bfdot v31.4s, v4.8h, v1.h[3]\n"
- "4:\n"
- "str q13, [%[c_ptr], #0x40]\n"
- "str q17, [%[c_ptr], #0x50]\n"
- "str q10, [%[c_ptr], #0x60]\n"
- "str q14, [%[c_ptr], #0x70]\n"
- "str q18, [%[c_ptr], #0x80]\n"
- "str q11, [%[c_ptr], #0x90]\n"
- "str q15, [%[c_ptr], #0xa0]\n"
- "str q19, [%[c_ptr], #0xb0]\n"
- "str q20, [%[c_ptr], #0xc0]\n"
- "str q24, [%[c_ptr], #0xd0]\n"
- "str q28, [%[c_ptr], #0xe0]\n"
- "str q21, [%[c_ptr], #0xf0]\n"
- "str q25, [%[c_ptr], #0x100]\n"
- "str q29, [%[c_ptr], #0x110]\n"
- "str q22, [%[c_ptr], #0x120]\n"
- "str q26, [%[c_ptr], #0x130]\n"
- "str q30, [%[c_ptr], #0x140]\n"
- "str q23, [%[c_ptr], #0x150]\n"
- "str q27, [%[c_ptr], #0x160]\n"
- "str q31, [%[c_ptr], #0x170]\n"
- "add %[c_ptr], %[c_ptr], #0x180\n"
- : [a_ptr] "+r" (a_ptr), [b_ptr] "+r" (b_ptr), [c_ptr] "+r" (c_ptr),
- [loops] "+r" (loops), [tails] "+r" (tails)
- :
- : "v0", "v1", "v2", "v3", "v4", "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "cc", "memory"
- );
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp
index 95fed86c2f..2fea5ad2e7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12.hpp
@@ -31,10 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *, const bfloat16 *, float *, int, int, int);
-void a64_interleaved_bf16fp32_dot_12x8_x1(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+void a64_interleaved_bf16fp32_dot_8x12(const bfloat16 *, const bfloat16 *, float *, int, int, int);
-class interleaved_bf16fp32_dot_12x8 {
+class cls_a64_interleaved_bf16fp32_dot_8x12 {
public:
typedef bfloat16 operand_type;
typedef float result_type;
@@ -60,13 +59,11 @@ public:
// Use the standard fixed size transforms.
StdTransformsFixed<operand_type, result_type, 8, 12, 2> transforms = {};
- kern_type kernel=a64_interleaved_bf16fp32_dot_12x8;
+ kern_type kernel=a64_interleaved_bf16fp32_dot_8x12;
- interleaved_bf16fp32_dot_12x8(const CPUInfo *ci)
+ cls_a64_interleaved_bf16fp32_dot_8x12(const CPUInfo *)
{
- if (ci->get_cpu_model() == CPUModel::X1) {
- kernel = a64_interleaved_bf16fp32_dot_12x8_x1;
- }
+
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp
index 7ffae524dc..92149a5579 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_dot_8x12/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void a64_interleaved_bf16fp32_dot_12x8(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void a64_interleaved_bf16fp32_dot_8x12(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
const bfloat16 *a_ptr = Apanel;
float *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
index 7fac59947e..b2c2407b28 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12.hpp
@@ -31,9 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void a64_interleaved_bf16fp32_mmla_12x8(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+void a64_interleaved_bf16fp32_mmla_8x12(const bfloat16 *, const bfloat16 *, float *, int, int, int);
-class interleaved_bf16fp32_mmla_12x8 {
+class cls_a64_interleaved_bf16fp32_mmla_8x12 {
public:
typedef bfloat16 operand_type;
typedef float result_type;
@@ -59,9 +59,9 @@ public:
// Use the standard fixed size transforms.
StdTransformsFixed<operand_type, result_type, 8, 12, 4> transforms = {};
- kern_type kernel=a64_interleaved_bf16fp32_mmla_12x8;
+ kern_type kernel=a64_interleaved_bf16fp32_mmla_8x12;
- interleaved_bf16fp32_mmla_12x8(const CPUInfo *)
+ cls_a64_interleaved_bf16fp32_mmla_8x12(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
index 7f0eff29af..c476fcf171 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_bf16fp32_mmla_8x12/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void a64_interleaved_bf16fp32_mmla_12x8(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void a64_interleaved_bf16fp32_mmla_8x12(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
const bfloat16 *a_ptr = Apanel;
float *c_ptr = Cpanel;
@@ -87,13 +87,23 @@ void a64_interleaved_bf16fp32_mmla_12x8(const bfloat16 *Apanel, const bfloat16 *
"movi v27.4s, #0\n"
"prfm PLDL1KEEP, [%[b_ptr], #0x1c0]\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x200]\n"
+ "prfm PLDL1KEEP, [%[a_ptr], #0x200]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x240]\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x200]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x280]\n"
+ "prfm PLDL1KEEP, [%[a_ptr], #0x240]\n"
"movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x240]\n"
+ "prfm PLDL1KEEP, [%[a_ptr], #0x280]\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x280]\n"
+ "prfm PLDL1KEEP, [%[a_ptr], #0x2c0]\n"
"prfm PLDL1KEEP, [%[b_ptr], #0x2c0]\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x300]\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x340]\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x380]\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x3c0]\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x400]\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x440]\n"
"add %[a_ptr], %[a_ptr], #0x40\n"
"add %[b_ptr], %[b_ptr], #0x40\n"
"cbz %[loops], 1f\n"
@@ -105,19 +115,19 @@ void a64_interleaved_bf16fp32_mmla_12x8(const bfloat16 *Apanel, const bfloat16 *
".inst 0x6e44ec54 // bfmmla v20.4s, v2.8h, v4.8h\n"
"subs %[loops], %[loops], #0x1\n"
".inst 0x6e45ec09 // bfmmla v9.4s, v0.8h, v5.8h\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x1c0]\n"
+ "prfm PLDL1KEEP, [%[a_ptr], #0x2c0]\n"
".inst 0x6e44ec7a // bfmmla v26.4s, v3.8h, v4.8h\n"
"ldr q4, [%[b_ptr]]\n"
".inst 0x6e45ec2f // bfmmla v15.4s, v1.8h, v5.8h\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x2c0]\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x440]\n"
".inst 0x6e45ec55 // bfmmla v21.4s, v2.8h, v5.8h\n"
- "prfm PLDL1KEEP, [%[a_ptr], #0x200]\n"
+ "prfm PLDL1KEEP, [%[a_ptr], #0x300]\n"
".inst 0x6e45ec7b // bfmmla v27.4s, v3.8h, v5.8h\n"
"ldr q5, [%[b_ptr], #0x10]\n"
".inst 0x6e46ec0a // bfmmla v10.4s, v0.8h, v6.8h\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x300]\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x480]\n"
".inst 0x6e46ec30 // bfmmla v16.4s, v1.8h, v6.8h\n"
- "prfm PLDL1KEEP, [%[b_ptr], #0x340]\n"
+ "prfm PLDL1KEEP, [%[b_ptr], #0x4c0]\n"
".inst 0x6e46ec56 // bfmmla v22.4s, v2.8h, v6.8h\n"
".inst 0x6e46ec7c // bfmmla v28.4s, v3.8h, v6.8h\n"
"ldr q6, [%[b_ptr], #0x20]\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
index 7bfb2291a9..b17b76f170 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12.hpp
@@ -31,9 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void a64_interleaved_s8s32_mmla_12x8(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void a64_interleaved_s8s32_mmla_8x12(const int8_t *, const int8_t *, int32_t *, int, int, int);
-class interleaved_s8s32_mmla_12x8 {
+class cls_a64_interleaved_s8s32_mmla_8x12 {
public:
typedef int8_t operand_type;
typedef int32_t result_type;
@@ -58,10 +58,11 @@ public:
// Use the standard fixed size transforms.
StdTransformsFixed<operand_type, result_type, 8, 12, 8> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 8, true> transforms_quantized = {};
- kern_type kernel=a64_interleaved_s8s32_mmla_12x8;
+ kern_type kernel=a64_interleaved_s8s32_mmla_8x12;
- interleaved_s8s32_mmla_12x8(const CPUInfo *)
+ cls_a64_interleaved_s8s32_mmla_8x12(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp
index 7953510aa7..2093e75b8e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_s8s32_mmla_8x12/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void a64_interleaved_s8s32_mmla_12x8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+void a64_interleaved_s8s32_mmla_8x12(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
const int8_t *a_ptr = Apanel;
int32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
index d493517cf1..99dd0be0d9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12.hpp
@@ -31,9 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void a64_interleaved_u8u32_mmla_12x8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void a64_interleaved_u8u32_mmla_8x12(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
-class interleaved_u8u32_mmla_12x8 {
+class cls_a64_interleaved_u8u32_mmla_8x12 {
public:
typedef uint8_t operand_type;
typedef uint32_t result_type;
@@ -58,10 +58,11 @@ public:
// Use the standard fixed size transforms.
StdTransformsFixed<operand_type, result_type, 8, 12, 8> transforms = {};
+ StdTransformsFixed<operand_type, result_type, 8, 12, 8, true> transforms_quantized = {};
- kern_type kernel=a64_interleaved_u8u32_mmla_12x8;
+ kern_type kernel=a64_interleaved_u8u32_mmla_8x12;
- interleaved_u8u32_mmla_12x8(const CPUInfo *)
+ cls_a64_interleaved_u8u32_mmla_8x12(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
index dcd15f0345..568e5d1098 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_interleaved_u8u32_mmla_8x12/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void a64_interleaved_u8u32_mmla_12x8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+void a64_interleaved_u8u32_mmla_8x12(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
const uint8_t *a_ptr = Apanel;
uint32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp
index 981ce34b49..d77e1b0ac2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12.hpp
@@ -30,13 +30,13 @@
namespace arm_gemm {
// Actual kernel implementations
-void a64_sgemm_asimd_12x8(const float *, const float *, float *, int, int, int);
-void a64_sgemm_asimd_12x8_a53(const float *, const float *, float *, int, int, int);
-void a64_sgemm_asimd_12x8_a55(const float *, const float *, float *, int, int, int);
-void a64_sgemm_asimd_12x8_a55r1(const float *, const float *, float *, int, int, int);
-void a64_sgemm_asimd_12x8_x1(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_8x12(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_8x12_a53(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_8x12_a55(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_8x12_a55r1(const float *, const float *, float *, int, int, int);
+void a64_sgemm_asimd_8x12_x1(const float *, const float *, float *, int, int, int);
-// 12x8 SGEMM "strategy" class.
+// 8x12 SGEMM "strategy" class.
//
// This describes the characteristics of a family of kernels, in terms of
// the required interleave properties and the output block size.
@@ -44,7 +44,7 @@ void a64_sgemm_asimd_12x8_x1(const float *, const float *, float *, int, int, in
// All kernels in the family must share these characteristics. The actual
// kernel to be used can be chosen at runtime, based on the CPU_type
// structure.
-class sgemm_12x8 {
+class cls_a64_sgemm_8x12 {
public:
typedef float operand_type;
typedef float result_type;
@@ -83,25 +83,25 @@ public:
}
}
- kern_type kernel=a64_sgemm_asimd_12x8;
+ kern_type kernel=a64_sgemm_asimd_8x12;
- sgemm_12x8(const CPUInfo *ci) {
+ cls_a64_sgemm_8x12(const CPUInfo *ci) {
// Select specific kernel if available
switch(ci->get_cpu_model()) {
case CPUModel::A53:
- kernel = a64_sgemm_asimd_12x8_a53;
+ kernel = a64_sgemm_asimd_8x12_a53;
break;
case CPUModel::A55r0:
- kernel = a64_sgemm_asimd_12x8_a55;
+ kernel = a64_sgemm_asimd_8x12_a55;
break;
case CPUModel::A55r1:
- kernel = a64_sgemm_asimd_12x8_a55r1;
+ kernel = a64_sgemm_asimd_8x12_a55r1;
break;
case CPUModel::X1:
- kernel = a64_sgemm_asimd_12x8_x1;
+ kernel = a64_sgemm_asimd_8x12_x1;
break;
default:
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp
index 5532485efb..f4b6e7b70f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a53.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a53.cpp
@@ -29,7 +29,7 @@
namespace arm_gemm {
-void a64_sgemm_asimd_12x8_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void a64_sgemm_asimd_8x12_a53(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
const float *a_ptr = Apanel;
float *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp
index e9f071f7f4..5f86da8ef3 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55.cpp
@@ -29,7 +29,7 @@
namespace arm_gemm {
-void a64_sgemm_asimd_12x8_a55(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void a64_sgemm_asimd_8x12_a55(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
const float *a_ptr = Apanel;
float *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp
index 8a6fbacfad..7709ad1be6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/a55r1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/a55r1.cpp
@@ -29,7 +29,7 @@
namespace arm_gemm {
-void a64_sgemm_asimd_12x8_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, const int ablocks, const int bblocks, const int K) {
+void a64_sgemm_asimd_8x12_a55r1(const float *Apanel, const float *Bpanel, float *Cpanel, const int ablocks, const int bblocks, const int K) {
const float *a_ptr = Apanel;
float *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp
index 48dc46785e..dc72095a9b 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/generic.cpp
@@ -39,7 +39,7 @@
namespace arm_gemm {
-void a64_sgemm_asimd_12x8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void a64_sgemm_asimd_8x12(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
const float *a_ptr = Apanel;
float *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp
index 63fdf4df9f..89f8ac2d6c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_12x8/x1.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_sgemm_8x12/x1.cpp
@@ -39,7 +39,7 @@
namespace arm_gemm {
-void a64_sgemm_asimd_12x8_x1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void a64_sgemm_asimd_8x12_x1(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
const float *a_ptr = Apanel;
float *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp
index 6f31efe6cb..5f7252f019 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4.hpp
@@ -25,13 +25,15 @@
#ifdef __aarch64__
+
+
namespace arm_gemm
{
// Actual kernel implementations
-void a64_smallK_hybrid_fp32_mla_4x6(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+void a64_smallK_hybrid_fp32_mla_6x4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
-class smallK_hybrid_fp32_mla_4x6
+class cls_a64_smallK_hybrid_fp32_mla_6x4
{
public:
typedef float operand_type;
@@ -73,9 +75,9 @@ public:
StdTransformsFixed<operand_type, result_type, 6, 4, 1> transforms = {};
// Default to the generic kernel
- kern_type kernel=a64_smallK_hybrid_fp32_mla_4x6;
+ kern_type kernel=a64_smallK_hybrid_fp32_mla_6x4;
- smallK_hybrid_fp32_mla_4x6(const CPUInfo *)
+ cls_a64_smallK_hybrid_fp32_mla_6x4(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp
index e2fec6af16..52548b462c 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x6/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_6x4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void a64_smallK_hybrid_fp32_mla_4x6(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
+void a64_smallK_hybrid_fp32_mla_6x4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
const long loops_count = iceildiv(N, (int)4) - 1;
const long ldab = lda * sizeof(float);
const long ldcb = ldc * sizeof(float);
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp
index e9a094855a..a8e0c24eae 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4.hpp
@@ -25,13 +25,15 @@
#ifdef __aarch64__
+
+
namespace arm_gemm
{
// Actual kernel implementations
-void a64_smallK_hybrid_fp32_mla_4x8(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+void a64_smallK_hybrid_fp32_mla_8x4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
-class smallK_hybrid_fp32_mla_4x8
+class cls_a64_smallK_hybrid_fp32_mla_8x4
{
public:
typedef float operand_type;
@@ -73,9 +75,9 @@ public:
StdTransformsFixed<operand_type, result_type, 8, 4, 1> transforms = {};
// Default to the generic kernel
- kern_type kernel=a64_smallK_hybrid_fp32_mla_4x8;
+ kern_type kernel=a64_smallK_hybrid_fp32_mla_8x4;
- smallK_hybrid_fp32_mla_4x8(const CPUInfo *)
+ cls_a64_smallK_hybrid_fp32_mla_8x4(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp
index 11888bce74..deaef27ee9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_4x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_fp32_mla_8x4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void a64_smallK_hybrid_fp32_mla_4x8(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
+void a64_smallK_hybrid_fp32_mla_8x4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
const long loops_count = iceildiv(N, (int)4) - 1;
const long ldab = lda * sizeof(float);
const long ldcb = ldc * sizeof(float);
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp
index fc087b73db..abf0eda008 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4.hpp
@@ -31,10 +31,10 @@ namespace arm_gemm
{
// Actual kernel implementations
-void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
-void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+void a64_smallK_hybrid_s8s32_dot_6x4(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+void a64_smallK_hybrid_s8s32_dot_6x4_a55(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
-class smallK_hybrid_s8s32_dot_4x6
+class cls_a64_smallK_hybrid_s8s32_dot_6x4
{
public:
typedef int8_t operand_type;
@@ -76,12 +76,12 @@ public:
StdTransformsFixed<operand_type, result_type, 6, 4, 4> transforms = {};
// Default to the generic kernel
- kern_type kernel=a64_smallK_hybrid_s8s32_dot_4x6;
+ kern_type kernel=a64_smallK_hybrid_s8s32_dot_6x4;
- smallK_hybrid_s8s32_dot_4x6(const CPUInfo *ci)
+ cls_a64_smallK_hybrid_s8s32_dot_6x4(const CPUInfo *ci)
{
if (ci->get_cpu_model() == CPUModel::A55r1) {
- kernel = a64_smallK_hybrid_s8s32_dot_4x6_a55;
+ kernel = a64_smallK_hybrid_s8s32_dot_6x4_a55;
}
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/a55.cpp
index 2d6d2f064c..a9926602fc 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/a55.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
+void a64_smallK_hybrid_s8s32_dot_6x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
const long loops_count = iceildiv(N, (int)4) - 1;
const long ldab = lda * sizeof(int8_t);
const long ldcb = ldc * sizeof(int32_t);
@@ -97,6 +97,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q3, [a_ptr1], #0x10\n"
"ldr q6, [a_ptr2], #0x10\n"
@@ -107,18 +108,29 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ldr q4, [a_ptr1], #0x10\n"
"ldr q7, [a_ptr2], #0x10\n"
"ldr q10, [a_ptr3], #0x10\n"
- "ldr q13, [a_ptr4], #0x10\n"
- "ldr q16, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr s2, [%[a_ptr0]]\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr s5, [a_ptr1]\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr s8, [a_ptr2]\n"
"ldr s11, [a_ptr3]\n"
"ldr s14, [a_ptr4]\n"
"ldr s17, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
"subs %[odds], %[odds], #0x1\n"
+ "ldr q3, [a_ptr1], #0x10\n"
+ "ldr q6, [a_ptr2], #0x10\n"
+ "ldr q9, [a_ptr3], #0x10\n"
+ "ldr q12, [a_ptr4], #0x10\n"
+ "ldr q15, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q7, [a_ptr2], #0x10\n"
+ "ldr q10, [a_ptr3], #0x10\n"
+ "ldr q13, [a_ptr4], #0x10\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"b.ne 4f\n"
"ldr b2, [%[a_ptr0]]\n"
"ldr b5, [a_ptr1]\n"
@@ -145,40 +157,42 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ld1 {v14.b}[2], [a_ptr4]\n"
"ld1 {v17.b}[2], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
@@ -222,173 +236,219 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr d18, [%[b_ptr0]]\n"
+ "b.eq 7f\n"
+ "8:\n"
+ "str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0]]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "str q27, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
"ldr d19, [%[b_ptr0], #0x10]\n"
"ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
+ "movi v28.4s, #0\n"
"ldr d20, [%[b_ptr0], #0x20]\n"
"ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ "movi v29.4s, #0\n"
"ldr d21, [%[b_ptr0], #0x30]\n"
"ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "str q30, [c_ptr4]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ "movi v30.4s, #0\n"
"ldr d22, [%[b_ptr0], #0x40]\n"
"ins v18.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "str q31, [c_ptr5]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
+ "movi v31.4s, #0\n"
"ldr temploadreg2, [%[b_ptr0], #0x48]\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"ldr d23, [%[b_ptr0], #0x50]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"ins v19.d[1], temploadreg3\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
"ldr temploadreg3, [%[b_ptr0], #0x58]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
"ldr d24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
"ins v20.d[1], temploadreg0\n"
+ ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
"ldr temploadreg0, [%[b_ptr0], #0x68]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x70]\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"ins v21.d[1], temploadreg1\n"
+ ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
"ldr temploadreg1, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
"ins v22.d[1], temploadreg2\n"
+ ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
"ins v23.d[1], temploadreg3\n"
+ ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
+ ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
- "b.eq 7f\n"
- "8:\n"
- "str q26, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "movi v26.4s, #0\n"
+ ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+ "ldr d18, [%[b_ptr0]]\n"
+ ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
+ ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
+ ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
+ ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
+ ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
+ ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
+ ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
+ "b.ne 8b\n"
+ "7:\n"
+ "str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
- ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr d21, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
- "ins v21.d[1], temploadreg1\n"
".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr d22, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr d23, [%[b_ptr0], #0x50]\n"
".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
- "ins v23.d[1], temploadreg3\n"
".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr d24, [%[b_ptr0], #0x60]\n"
".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
- "ins v24.d[1], temploadreg0\n"
".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x70]\n"
".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
- "ins v25.d[1], temploadreg1\n"
".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
- "ins v18.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v22.d[1], temploadreg2\n"
- "b.ne 8b\n"
- "7:\n"
- "str q26, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "b 9f\n"
+ "6:\n"
"movi v26.4s, #0\n"
- "str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
- "str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
- "str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
- ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
@@ -435,19 +495,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -514,6 +569,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q3, [a_ptr1], #0x10\n"
"ldr q6, [a_ptr2], #0x10\n"
@@ -524,24 +580,35 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ldr q4, [a_ptr1], #0x10\n"
"ldr q7, [a_ptr2], #0x10\n"
"ldr q10, [a_ptr3], #0x10\n"
- "ldr q13, [a_ptr4], #0x10\n"
- "ldr q16, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr d2, [%[a_ptr0]]\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr d5, [a_ptr1]\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr d8, [a_ptr2]\n"
"ldr d11, [a_ptr3]\n"
"ldr d14, [a_ptr4]\n"
"ldr d17, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q3, [a_ptr1], #0x10\n"
+ "ldr q6, [a_ptr2], #0x10\n"
+ "ldr q9, [a_ptr3], #0x10\n"
+ "ldr q12, [a_ptr4], #0x10\n"
+ "ldr q15, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q7, [a_ptr2], #0x10\n"
+ "ldr q10, [a_ptr3], #0x10\n"
"ldr s2, [%[a_ptr0]], #0x4\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr s5, [a_ptr1], #0x4\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr s8, [a_ptr2], #0x4\n"
"ldr s11, [a_ptr3], #0x4\n"
"ldr s14, [a_ptr4], #0x4\n"
"ldr s17, [a_ptr5], #0x4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v2.b}[4], [%[a_ptr0]]\n"
"ld1 {v5.b}[4], [a_ptr1]\n"
@@ -568,38 +635,40 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ld1 {v14.b}[6], [a_ptr4]\n"
"ld1 {v17.b}[6], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
@@ -652,180 +721,233 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr d18, [%[b_ptr0]]\n"
+ "b.eq 7f\n"
+ "8:\n"
+ "str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0]]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "str q27, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
"ldr d19, [%[b_ptr0], #0x10]\n"
"ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
+ "movi v28.4s, #0\n"
"ldr d20, [%[b_ptr0], #0x20]\n"
"ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ "movi v29.4s, #0\n"
"ldr d21, [%[b_ptr0], #0x30]\n"
"ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "str q30, [c_ptr4]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ "movi v30.4s, #0\n"
"ldr d22, [%[b_ptr0], #0x40]\n"
"ins v18.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "str q31, [c_ptr5]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
+ "movi v31.4s, #0\n"
"ldr temploadreg2, [%[b_ptr0], #0x48]\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"ldr d23, [%[b_ptr0], #0x50]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"ins v19.d[1], temploadreg3\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
"ldr temploadreg3, [%[b_ptr0], #0x58]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
"ldr d24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
"ins v20.d[1], temploadreg0\n"
+ ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
"ldr temploadreg0, [%[b_ptr0], #0x68]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x70]\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"ins v21.d[1], temploadreg1\n"
+ ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
"ldr temploadreg1, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
"ins v22.d[1], temploadreg2\n"
+ ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
"ins v23.d[1], temploadreg3\n"
+ ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
+ ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
- "b.eq 7f\n"
- "8:\n"
- "str q26, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "movi v26.4s, #0\n"
+ ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+ "ldr d18, [%[b_ptr0]]\n"
+ ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+ "ldr d19, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
"ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
+ "ins v19.d[1], temploadreg3\n"
+ ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
+ ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
+ ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
+ ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
+ ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
+ ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
+ ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
+ ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
+ "b.ne 8b\n"
+ "7:\n"
+ "str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
"add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr d21, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
- "ins v21.d[1], temploadreg1\n"
".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr d22, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr d23, [%[b_ptr0], #0x50]\n"
".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr d24, [%[b_ptr0], #0x60]\n"
".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
- "ins v24.d[1], temploadreg0\n"
".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x70]\n"
".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
- "ins v25.d[1], temploadreg1\n"
".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
- "ins v22.d[1], temploadreg2\n"
- "ins v19.d[1], temploadreg3\n"
- "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v23.d[1], temploadreg3\n"
- "b.ne 8b\n"
- "7:\n"
- "str q26, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "b 9f\n"
+ "6:\n"
"movi v26.4s, #0\n"
- "str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
- "str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
- "str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
@@ -881,19 +1003,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1014,38 +1131,40 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ld1 {v14.b}[10], [a_ptr4]\n"
"ld1 {v17.b}[10], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
@@ -1105,189 +1224,249 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr d18, [%[b_ptr0]]\n"
+ "b.eq 7f\n"
+ "8:\n"
+ "str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0]]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "str q27, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
"ldr d19, [%[b_ptr0], #0x10]\n"
"ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
+ "movi v28.4s, #0\n"
"ldr d20, [%[b_ptr0], #0x20]\n"
"ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ "movi v29.4s, #0\n"
"ldr d21, [%[b_ptr0], #0x30]\n"
"ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "str q30, [c_ptr4]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ "movi v30.4s, #0\n"
"ldr d22, [%[b_ptr0], #0x40]\n"
"ins v18.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "str q31, [c_ptr5]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
+ "movi v31.4s, #0\n"
"ldr temploadreg2, [%[b_ptr0], #0x48]\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"ldr d23, [%[b_ptr0], #0x50]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"ins v19.d[1], temploadreg3\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
"ldr temploadreg3, [%[b_ptr0], #0x58]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
"ldr d24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
"ins v20.d[1], temploadreg0\n"
+ ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
"ldr temploadreg0, [%[b_ptr0], #0x68]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x70]\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"ins v21.d[1], temploadreg1\n"
+ ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
"ldr temploadreg1, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
"ins v22.d[1], temploadreg2\n"
+ ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
"ins v23.d[1], temploadreg3\n"
+ ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
+ ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
- "b.eq 7f\n"
- "8:\n"
- "str q26, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "movi v26.4s, #0\n"
+ ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+ "ldr d18, [%[b_ptr0]]\n"
+ ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+ "ldr d19, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
"ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
+ "ldr d20, [%[b_ptr0], #0x20]\n"
+ ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
+ "ins v19.d[1], temploadreg3\n"
+ ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
+ "ins v20.d[1], temploadreg0\n"
+ ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
+ ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
+ ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
+ ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
+ ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
+ ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
+ ".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
+ ".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
+ ".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
+ ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
+ ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
+ ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
+ "b.ne 8b\n"
+ "7:\n"
+ "str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
"add %[b_ptr0], %[b_ptr0], #0x30\n"
".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr d21, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
- "ins v21.d[1], temploadreg1\n"
".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr d22, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr d23, [%[b_ptr0], #0x50]\n"
".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr d24, [%[b_ptr0], #0x60]\n"
".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x70]\n"
".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
- "ins v25.d[1], temploadreg1\n"
".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
- "ins v22.d[1], temploadreg2\n"
".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
- "ins v23.d[1], temploadreg3\n"
- "ins v20.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v24.d[1], temploadreg0\n"
- "b.ne 8b\n"
- "7:\n"
- "str q26, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "b 9f\n"
+ "6:\n"
"movi v26.4s, #0\n"
- "str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
- "str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
- "str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1350,19 +1529,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1429,6 +1603,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q3, [a_ptr1], #0x10\n"
"ldr q6, [a_ptr2], #0x10\n"
@@ -1441,7 +1616,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ldr q10, [a_ptr3], #0x10\n"
"ldr q13, [a_ptr4], #0x10\n"
"ldr q16, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr q2, [%[a_ptr0]]\n"
"ldr q5, [a_ptr1]\n"
"ldr q8, [a_ptr2]\n"
@@ -1450,8 +1624,21 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ldr q17, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q3, [a_ptr1], #0x10\n"
+ "ldr q6, [a_ptr2], #0x10\n"
+ "ldr q9, [a_ptr3], #0x10\n"
+ "ldr q12, [a_ptr4], #0x10\n"
+ "ldr q15, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q7, [a_ptr2], #0x10\n"
+ "ldr q10, [a_ptr3], #0x10\n"
"ldr d2, [%[a_ptr0]], #0x8\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr d5, [a_ptr1], #0x8\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr d8, [a_ptr2], #0x8\n"
"ldr d11, [a_ptr3], #0x8\n"
"ldr d14, [a_ptr4], #0x8\n"
@@ -1462,7 +1649,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ld1 {v11.s}[2], [a_ptr3], #4\n"
"ld1 {v14.s}[2], [a_ptr4], #4\n"
"ld1 {v17.s}[2], [a_ptr5], #4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v2.b}[12], [%[a_ptr0]]\n"
"ld1 {v5.b}[12], [a_ptr1]\n"
@@ -1489,38 +1675,40 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ld1 {v14.b}[14], [a_ptr4]\n"
"ld1 {v17.b}[14], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
@@ -1587,198 +1775,265 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr d18, [%[b_ptr0]]\n"
+ "b.eq 7f\n"
+ "8:\n"
+ "str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0]]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "str q27, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
"ldr d19, [%[b_ptr0], #0x10]\n"
"ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
+ "movi v28.4s, #0\n"
"ldr d20, [%[b_ptr0], #0x20]\n"
"ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ "movi v29.4s, #0\n"
"ldr d21, [%[b_ptr0], #0x30]\n"
"ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "str q30, [c_ptr4]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ "movi v30.4s, #0\n"
"ldr d22, [%[b_ptr0], #0x40]\n"
"ins v18.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "str q31, [c_ptr5]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
+ "movi v31.4s, #0\n"
"ldr temploadreg2, [%[b_ptr0], #0x48]\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"ldr d23, [%[b_ptr0], #0x50]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"ins v19.d[1], temploadreg3\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
"ldr temploadreg3, [%[b_ptr0], #0x58]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
"ldr d24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
"ins v20.d[1], temploadreg0\n"
+ ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
"ldr temploadreg0, [%[b_ptr0], #0x68]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x70]\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"ins v21.d[1], temploadreg1\n"
+ ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
"ldr temploadreg1, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
"ins v22.d[1], temploadreg2\n"
+ ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
"ins v23.d[1], temploadreg3\n"
+ ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
- "b.eq 7f\n"
- "8:\n"
- "str q26, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "movi v26.4s, #0\n"
+ ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
+ ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+ "ldr d18, [%[b_ptr0]]\n"
+ ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+ "ldr d19, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
+ "ldr d20, [%[b_ptr0], #0x20]\n"
+ ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
+ "ins v19.d[1], temploadreg3\n"
+ ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
+ "ldr d21, [%[b_ptr0], #0x30]\n"
+ ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
+ "ins v20.d[1], temploadreg0\n"
+ ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
+ "ins v21.d[1], temploadreg1\n"
+ ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
+ ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
+ ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
+ ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
+ ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
+ ".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
+ ".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
+ ".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
+ ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
+ ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
+ ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
+ ".inst 0x4fa2eaba // sdot v26.4s, v21.16b, v2.4b[3]\n"
+ ".inst 0x4fa5eabb // sdot v27.4s, v21.16b, v5.4b[3]\n"
+ ".inst 0x4fa8eabc // sdot v28.4s, v21.16b, v8.4b[3]\n"
+ ".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
+ ".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
+ ".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
+ "b.ne 8b\n"
+ "7:\n"
+ "str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr d21, [%[b_ptr0], #0x30]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
"add %[b_ptr0], %[b_ptr0], #0x40\n"
".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
- "ins v21.d[1], temploadreg1\n"
".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
- "ldr d22, [%[b_ptr0], #0x40]\n"
".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr d23, [%[b_ptr0], #0x50]\n"
".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr d24, [%[b_ptr0], #0x60]\n"
".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x70]\n"
".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
- "ins v22.d[1], temploadreg2\n"
".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
".inst 0x4fa2eaba // sdot v26.4s, v21.16b, v2.4b[3]\n"
- "ins v23.d[1], temploadreg3\n"
".inst 0x4fa5eabb // sdot v27.4s, v21.16b, v5.4b[3]\n"
".inst 0x4fa8eabc // sdot v28.4s, v21.16b, v8.4b[3]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
- "ldr d21, [%[b_ptr0], #0x30]\n"
- "ins v24.d[1], temploadreg0\n"
- "ins v21.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "b.ne 8b\n"
- "7:\n"
- "str q26, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "b 9f\n"
+ "6:\n"
"movi v26.4s, #0\n"
- "ins v25.d[1], temploadreg1\n"
- "str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
- "str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
- "str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1848,19 +2103,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1927,6 +2177,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q4, [a_ptr1], #0x10\n"
"ldr q8, [a_ptr2], #0x10\n"
@@ -1943,18 +2194,35 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ldr q6, [a_ptr1], #0x10\n"
"ldr q10, [a_ptr2], #0x10\n"
"ldr q14, [a_ptr3], #0x10\n"
- "ldr q18, [a_ptr4], #0x10\n"
- "ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr s3, [%[a_ptr0]]\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr s7, [a_ptr1]\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr s11, [a_ptr2]\n"
"ldr s15, [a_ptr3]\n"
"ldr s19, [a_ptr4]\n"
"ldr s23, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
"subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
+ "ldr q2, [%[a_ptr0]], #0x10\n"
+ "ldr q6, [a_ptr1], #0x10\n"
+ "ldr q10, [a_ptr2], #0x10\n"
+ "ldr q14, [a_ptr3], #0x10\n"
+ "ldr q18, [a_ptr4], #0x10\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"b.ne 4f\n"
"ldr b3, [%[a_ptr0]]\n"
"ldr b7, [a_ptr1]\n"
@@ -1981,24 +2249,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ld1 {v19.b}[2], [a_ptr4]\n"
"ld1 {v23.b}[2], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2091,57 +2361,55 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr d24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
- "ins v25.d[1], temploadreg1\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v26.4s, #0\n"
+ "ldr d24, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ "ldr d25, [%[b_ptr0], #0x10]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q28, [c_ptr2]\n"
- "movi v28.4s, #0\n"
"add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ins v24.d[1], temploadreg0\n"
+ "ins v25.d[1], temploadreg1\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q29, [c_ptr3]\n"
- "movi v29.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
"ldr d24, [%[b_ptr0]]\n"
".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x10]\n"
".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
@@ -2235,27 +2503,23 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
"ins v24.d[1], temploadreg0\n"
".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
- "ins v25.d[1], temploadreg1\n"
".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
- "ldr d24, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
@@ -2366,19 +2630,117 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+ ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+ ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2445,6 +2807,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q4, [a_ptr1], #0x10\n"
"ldr q8, [a_ptr2], #0x10\n"
@@ -2461,24 +2824,41 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ldr q6, [a_ptr1], #0x10\n"
"ldr q10, [a_ptr2], #0x10\n"
"ldr q14, [a_ptr3], #0x10\n"
- "ldr q18, [a_ptr4], #0x10\n"
- "ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr d3, [%[a_ptr0]]\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr d7, [a_ptr1]\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr d11, [a_ptr2]\n"
"ldr d15, [a_ptr3]\n"
"ldr d19, [a_ptr4]\n"
"ldr d23, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
+ "ldr q2, [%[a_ptr0]], #0x10\n"
+ "ldr q6, [a_ptr1], #0x10\n"
+ "ldr q10, [a_ptr2], #0x10\n"
+ "ldr q14, [a_ptr3], #0x10\n"
"ldr s3, [%[a_ptr0]], #0x4\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr s7, [a_ptr1], #0x4\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr s11, [a_ptr2], #0x4\n"
"ldr s15, [a_ptr3], #0x4\n"
"ldr s19, [a_ptr4], #0x4\n"
"ldr s23, [a_ptr5], #0x4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v3.b}[4], [%[a_ptr0]]\n"
"ld1 {v7.b}[4], [a_ptr1]\n"
@@ -2505,24 +2885,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ld1 {v19.b}[6], [a_ptr4]\n"
"ld1 {v23.b}[6], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2622,68 +3004,66 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr d24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v26.4s, #0\n"
- "ins v25.d[1], temploadreg1\n"
+ "ldr d24, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
+ "ldr d25, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v28.4s, #0\n"
+ "ins v24.d[1], temploadreg0\n"
+ "ins v25.d[1], temploadreg1\n"
"prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
"str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
"ldr d24, [%[b_ptr0]]\n"
".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x10]\n"
".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
"ldr d24, [%[b_ptr0]]\n"
".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
@@ -2775,27 +3155,23 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
"ins v25.d[1], temploadreg1\n"
".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
- "ldr d24, [%[b_ptr0]]\n"
".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n"
".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
- "ins v24.d[1], temploadreg0\n"
".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v26.4s, #0\n"
- "ins v25.d[1], temploadreg1\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
@@ -2913,19 +3289,124 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+ ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n"
+ ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
+ ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
+ ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
+ ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
+ ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -3052,24 +3533,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ld1 {v19.b}[10], [a_ptr4]\n"
"ld1 {v23.b}[10], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -3177,57 +3660,55 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr d24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
- "ins v25.d[1], temploadreg1\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v26.4s, #0\n"
+ "ldr d24, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ "ldr d25, [%[b_ptr0], #0x10]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q28, [c_ptr2]\n"
- "movi v28.4s, #0\n"
"add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ins v24.d[1], temploadreg0\n"
+ "ins v25.d[1], temploadreg1\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q29, [c_ptr3]\n"
- "movi v29.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
"ldr d24, [%[b_ptr0]]\n"
".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x10]\n"
".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
@@ -3340,27 +3821,23 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
".inst 0x4f83eb1a // sdot v26.4s, v24.16b, v3.4b[2]\n"
".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n"
".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n"
- "ins v25.d[1], temploadreg1\n"
".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
- "ldr d24, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
@@ -3486,19 +3963,132 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
+ ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
+ ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
+ ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
+ ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
+ ".inst 0x4f83eb1a // sdot v26.4s, v24.16b, v3.4b[2]\n"
+ ".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n"
+ ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
+ ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
+ ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -3566,6 +4156,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q4, [a_ptr1], #0x10\n"
"ldr q8, [a_ptr2], #0x10\n"
@@ -3584,7 +4175,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ldr q14, [a_ptr3], #0x10\n"
"ldr q18, [a_ptr4], #0x10\n"
"ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr q3, [%[a_ptr0]]\n"
"ldr q7, [a_ptr1]\n"
"ldr q11, [a_ptr2]\n"
@@ -3593,8 +4183,27 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ldr q23, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
+ "ldr q2, [%[a_ptr0]], #0x10\n"
+ "ldr q6, [a_ptr1], #0x10\n"
+ "ldr q10, [a_ptr2], #0x10\n"
+ "ldr q14, [a_ptr3], #0x10\n"
"ldr d3, [%[a_ptr0]], #0x8\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr d7, [a_ptr1], #0x8\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr d11, [a_ptr2], #0x8\n"
"ldr d15, [a_ptr3], #0x8\n"
"ldr d19, [a_ptr4], #0x8\n"
@@ -3605,7 +4214,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ld1 {v15.s}[2], [a_ptr3], #4\n"
"ld1 {v19.s}[2], [a_ptr4], #4\n"
"ld1 {v23.s}[2], [a_ptr5], #4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v3.b}[12], [%[a_ptr0]]\n"
"ld1 {v7.b}[12], [a_ptr1]\n"
@@ -3632,24 +4240,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
"ld1 {v19.b}[14], [a_ptr4]\n"
"ld1 {v23.b}[14], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -3764,68 +4374,66 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n"
".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n"
".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr d24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v26.4s, #0\n"
- "ins v25.d[1], temploadreg1\n"
+ "ldr d24, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
+ "ldr d25, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v28.4s, #0\n"
+ "ins v24.d[1], temploadreg0\n"
+ "ins v25.d[1], temploadreg1\n"
"prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
"str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
"ldr d24, [%[b_ptr0]]\n"
".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x10]\n"
".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
"ldr d24, [%[b_ptr0]]\n"
".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
@@ -3936,27 +4544,23 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
- "ldr d24, [%[b_ptr0]]\n"
".inst 0x4fa3eb3a // sdot v26.4s, v25.16b, v3.4b[3]\n"
".inst 0x4fa7eb3b // sdot v27.4s, v25.16b, v7.4b[3]\n"
".inst 0x4fabeb3c // sdot v28.4s, v25.16b, v11.4b[3]\n"
- "ins v24.d[1], temploadreg0\n"
".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n"
".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n"
".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v26.4s, #0\n"
- "ins v25.d[1], temploadreg1\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
@@ -4089,19 +4693,139 @@ void a64_smallK_hybrid_s8s32_dot_4x6_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n"
".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n"
".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n"
+ ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
+ ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
+ ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
+ ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
+ ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f83eb1a // sdot v26.4s, v24.16b, v3.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n"
+ ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
+ ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
+ ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
+ ".inst 0x4fa3eb3a // sdot v26.4s, v25.16b, v3.4b[3]\n"
+ ".inst 0x4fa7eb3b // sdot v27.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x4fabeb3c // sdot v28.4s, v25.16b, v11.4b[3]\n"
+ ".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n"
+ ".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n"
+ ".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/generic.cpp
index 88ad36a27a..9ff39719f7 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x6/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_6x4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
+void a64_smallK_hybrid_s8s32_dot_6x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
const long loops_count = iceildiv(N, (int)4) - 1;
const long ldab = lda * sizeof(int8_t);
const long ldcb = ldc * sizeof(int32_t);
@@ -93,6 +93,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q3, [a_ptr1], #0x10\n"
"ldr q6, [a_ptr2], #0x10\n"
@@ -103,18 +104,29 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ldr q4, [a_ptr1], #0x10\n"
"ldr q7, [a_ptr2], #0x10\n"
"ldr q10, [a_ptr3], #0x10\n"
- "ldr q13, [a_ptr4], #0x10\n"
- "ldr q16, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr s2, [%[a_ptr0]]\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr s5, [a_ptr1]\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr s8, [a_ptr2]\n"
"ldr s11, [a_ptr3]\n"
"ldr s14, [a_ptr4]\n"
"ldr s17, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
"subs %[odds], %[odds], #0x1\n"
+ "ldr q3, [a_ptr1], #0x10\n"
+ "ldr q6, [a_ptr2], #0x10\n"
+ "ldr q9, [a_ptr3], #0x10\n"
+ "ldr q12, [a_ptr4], #0x10\n"
+ "ldr q15, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q7, [a_ptr2], #0x10\n"
+ "ldr q10, [a_ptr3], #0x10\n"
+ "ldr q13, [a_ptr4], #0x10\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"b.ne 4f\n"
"ldr b2, [%[a_ptr0]]\n"
"ldr b5, [a_ptr1]\n"
@@ -141,40 +153,42 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ld1 {v14.b}[2], [a_ptr4]\n"
"ld1 {v17.b}[2], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
@@ -218,139 +232,201 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr q18, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"subs %[loops], %[loops], #0x1\n"
- "str q27, [c_ptr1]\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
+ ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
+ ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+ ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+ ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
+ ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+ ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
+ ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
+ ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
+ ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
+ ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
+ ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
+ ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
+ ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
+ ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
+ "b 9f\n"
+ "6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
- ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
@@ -397,19 +473,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -468,6 +539,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q3, [a_ptr1], #0x10\n"
"ldr q6, [a_ptr2], #0x10\n"
@@ -478,24 +550,35 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ldr q4, [a_ptr1], #0x10\n"
"ldr q7, [a_ptr2], #0x10\n"
"ldr q10, [a_ptr3], #0x10\n"
- "ldr q13, [a_ptr4], #0x10\n"
- "ldr q16, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr d2, [%[a_ptr0]]\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr d5, [a_ptr1]\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr d8, [a_ptr2]\n"
"ldr d11, [a_ptr3]\n"
"ldr d14, [a_ptr4]\n"
"ldr d17, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q3, [a_ptr1], #0x10\n"
+ "ldr q6, [a_ptr2], #0x10\n"
+ "ldr q9, [a_ptr3], #0x10\n"
+ "ldr q12, [a_ptr4], #0x10\n"
+ "ldr q15, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q7, [a_ptr2], #0x10\n"
+ "ldr q10, [a_ptr3], #0x10\n"
"ldr s2, [%[a_ptr0]], #0x4\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr s5, [a_ptr1], #0x4\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr s8, [a_ptr2], #0x4\n"
"ldr s11, [a_ptr3], #0x4\n"
"ldr s14, [a_ptr4], #0x4\n"
"ldr s17, [a_ptr5], #0x4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v2.b}[4], [%[a_ptr0]]\n"
"ld1 {v5.b}[4], [a_ptr1]\n"
@@ -522,38 +605,40 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ld1 {v14.b}[6], [a_ptr4]\n"
"ld1 {v17.b}[6], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
@@ -606,144 +691,213 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr q18, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"subs %[loops], %[loops], #0x1\n"
- "str q27, [c_ptr1]\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
"add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
+ ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
+ ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+ ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+ ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
+ ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+ ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
+ ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
+ ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
+ ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
+ ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
+ ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
+ ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
+ ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
+ ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
+ ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
+ ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
+ ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
+ ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
@@ -799,19 +953,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -924,38 +1073,40 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ld1 {v14.b}[10], [a_ptr4]\n"
"ld1 {v17.b}[10], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
@@ -1015,62 +1166,60 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr q18, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"subs %[loops], %[loops], #0x1\n"
- "str q27, [c_ptr1]\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
@@ -1081,85 +1230,163 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
+ ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
+ ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+ ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+ ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
+ ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+ ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
+ ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
+ ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
+ ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
+ ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
+ ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
+ ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
+ ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
+ ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
+ ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
+ ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
+ ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
+ ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
+ ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
+ ".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
+ ".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
+ ".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
+ ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
+ ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
+ ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
+ "b 9f\n"
+ "6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1222,19 +1449,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1293,6 +1515,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q3, [a_ptr1], #0x10\n"
"ldr q6, [a_ptr2], #0x10\n"
@@ -1305,7 +1528,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ldr q10, [a_ptr3], #0x10\n"
"ldr q13, [a_ptr4], #0x10\n"
"ldr q16, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr q2, [%[a_ptr0]]\n"
"ldr q5, [a_ptr1]\n"
"ldr q8, [a_ptr2]\n"
@@ -1314,8 +1536,21 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ldr q17, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q3, [a_ptr1], #0x10\n"
+ "ldr q6, [a_ptr2], #0x10\n"
+ "ldr q9, [a_ptr3], #0x10\n"
+ "ldr q12, [a_ptr4], #0x10\n"
+ "ldr q15, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q7, [a_ptr2], #0x10\n"
+ "ldr q10, [a_ptr3], #0x10\n"
"ldr d2, [%[a_ptr0]], #0x8\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr d5, [a_ptr1], #0x8\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr d8, [a_ptr2], #0x8\n"
"ldr d11, [a_ptr3], #0x8\n"
"ldr d14, [a_ptr4], #0x8\n"
@@ -1326,7 +1561,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ld1 {v11.s}[2], [a_ptr3], #4\n"
"ld1 {v14.s}[2], [a_ptr4], #4\n"
"ld1 {v17.s}[2], [a_ptr5], #4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v2.b}[12], [%[a_ptr0]]\n"
"ld1 {v5.b}[12], [a_ptr1]\n"
@@ -1353,38 +1587,40 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ld1 {v14.b}[14], [a_ptr4]\n"
"ld1 {v17.b}[14], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
@@ -1451,62 +1687,60 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr q18, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"subs %[loops], %[loops], #0x1\n"
- "str q27, [c_ptr1]\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
@@ -1524,85 +1758,170 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4fa2eaba // sdot v26.4s, v21.16b, v2.4b[3]\n"
".inst 0x4fa5eabb // sdot v27.4s, v21.16b, v5.4b[3]\n"
".inst 0x4fa8eabc // sdot v28.4s, v21.16b, v8.4b[3]\n"
".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
+ ".inst 0x4face27e // sdot v30.4s, v19.16b, v12.4b[1]\n"
+ ".inst 0x4fafe27f // sdot v31.4s, v19.16b, v15.4b[1]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f80ea9a // sdot v26.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x4f83ea9b // sdot v27.4s, v20.16b, v3.4b[2]\n"
+ ".inst 0x4f86ea9c // sdot v28.4s, v20.16b, v6.4b[2]\n"
+ ".inst 0x4f89ea9d // sdot v29.4s, v20.16b, v9.4b[2]\n"
+ ".inst 0x4f8cea9e // sdot v30.4s, v20.16b, v12.4b[2]\n"
+ ".inst 0x4f8fea9f // sdot v31.4s, v20.16b, v15.4b[2]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
+ ".inst 0x4fa0eaba // sdot v26.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x4fa3eabb // sdot v27.4s, v21.16b, v3.4b[3]\n"
+ ".inst 0x4fa6eabc // sdot v28.4s, v21.16b, v6.4b[3]\n"
+ ".inst 0x4fa9eabd // sdot v29.4s, v21.16b, v9.4b[3]\n"
+ ".inst 0x4faceabe // sdot v30.4s, v21.16b, v12.4b[3]\n"
+ ".inst 0x4fafeabf // sdot v31.4s, v21.16b, v15.4b[3]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ ".inst 0x4f81e2da // sdot v26.4s, v22.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".inst 0x4f84e2db // sdot v27.4s, v22.16b, v4.4b[0]\n"
+ ".inst 0x4f87e2dc // sdot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x4f8ae2dd // sdot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x4f8de2de // sdot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x4f90e2df // sdot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x4fa1e2fa // sdot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x4fa4e2fb // sdot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x4fa7e2fc // sdot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x4faae2fd // sdot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x4fade2fe // sdot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x4fb0e2ff // sdot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f87eb1c // sdot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x4f8aeb1d // sdot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8deb1e // sdot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f90eb1f // sdot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa7eb3c // sdot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x4faaeb3d // sdot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4fadeb3e // sdot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb0eb3f // sdot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4f82e25a // sdot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x4f85e25b // sdot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x4f88e25c // sdot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x4f8be25d // sdot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x4f8ee25e // sdot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x4f91e25f // sdot v31.4s, v18.16b, v17.4b[0]\n"
+ ".inst 0x4fa2e27a // sdot v26.4s, v19.16b, v2.4b[1]\n"
+ ".inst 0x4fa5e27b // sdot v27.4s, v19.16b, v5.4b[1]\n"
+ ".inst 0x4fa8e27c // sdot v28.4s, v19.16b, v8.4b[1]\n"
+ ".inst 0x4fabe27d // sdot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x4faee27e // sdot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x4fb1e27f // sdot v31.4s, v19.16b, v17.4b[1]\n"
+ ".inst 0x4f82ea9a // sdot v26.4s, v20.16b, v2.4b[2]\n"
+ ".inst 0x4f85ea9b // sdot v27.4s, v20.16b, v5.4b[2]\n"
+ ".inst 0x4f88ea9c // sdot v28.4s, v20.16b, v8.4b[2]\n"
+ ".inst 0x4f8bea9d // sdot v29.4s, v20.16b, v11.4b[2]\n"
+ ".inst 0x4f8eea9e // sdot v30.4s, v20.16b, v14.4b[2]\n"
+ ".inst 0x4f91ea9f // sdot v31.4s, v20.16b, v17.4b[2]\n"
+ ".inst 0x4fa2eaba // sdot v26.4s, v21.16b, v2.4b[3]\n"
+ ".inst 0x4fa5eabb // sdot v27.4s, v21.16b, v5.4b[3]\n"
+ ".inst 0x4fa8eabc // sdot v28.4s, v21.16b, v8.4b[3]\n"
+ ".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
+ ".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
+ ".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
+ "b 9f\n"
+ "6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e25a // sdot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x4f83e25b // sdot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x4f86e25c // sdot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x4f89e25d // sdot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x4f8ce25e // sdot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4f8fe25f // sdot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e27a // sdot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x4fa3e27b // sdot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x4fa6e27c // sdot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x4fa9e27d // sdot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1672,19 +1991,14 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fabeabd // sdot v29.4s, v21.16b, v11.4b[3]\n"
".inst 0x4faeeabe // sdot v30.4s, v21.16b, v14.4b[3]\n"
".inst 0x4fb1eabf // sdot v31.4s, v21.16b, v17.4b[3]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1743,6 +2057,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q4, [a_ptr1], #0x10\n"
"ldr q8, [a_ptr2], #0x10\n"
@@ -1759,18 +2074,35 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ldr q6, [a_ptr1], #0x10\n"
"ldr q10, [a_ptr2], #0x10\n"
"ldr q14, [a_ptr3], #0x10\n"
- "ldr q18, [a_ptr4], #0x10\n"
- "ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr s3, [%[a_ptr0]]\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr s7, [a_ptr1]\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr s11, [a_ptr2]\n"
"ldr s15, [a_ptr3]\n"
"ldr s19, [a_ptr4]\n"
"ldr s23, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
"subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
+ "ldr q2, [%[a_ptr0]], #0x10\n"
+ "ldr q6, [a_ptr1], #0x10\n"
+ "ldr q10, [a_ptr2], #0x10\n"
+ "ldr q14, [a_ptr3], #0x10\n"
+ "ldr q18, [a_ptr4], #0x10\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"b.ne 4f\n"
"ldr b3, [%[a_ptr0]]\n"
"ldr b7, [a_ptr1]\n"
@@ -1797,24 +2129,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ld1 {v19.b}[2], [a_ptr4]\n"
"ld1 {v23.b}[2], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -1907,38 +2241,36 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr q24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q25, [%[b_ptr0], #0x10]\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "movi v26.4s, #0\n"
"subs %[loops], %[loops], #0x1\n"
- "str q27, [c_ptr1]\n"
+ "movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
"ldr q24, [%[b_ptr0]]\n"
".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
@@ -2028,20 +2360,20 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
- "ldr q25, [%[b_ptr0], #0x10]\n"
".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
- "ldr q24, [%[b_ptr0]]\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "movi v26.4s, #0\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
@@ -2152,19 +2484,117 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+ ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+ ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2223,6 +2653,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q4, [a_ptr1], #0x10\n"
"ldr q8, [a_ptr2], #0x10\n"
@@ -2239,24 +2670,41 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ldr q6, [a_ptr1], #0x10\n"
"ldr q10, [a_ptr2], #0x10\n"
"ldr q14, [a_ptr3], #0x10\n"
- "ldr q18, [a_ptr4], #0x10\n"
- "ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr d3, [%[a_ptr0]]\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr d7, [a_ptr1]\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr d11, [a_ptr2]\n"
"ldr d15, [a_ptr3]\n"
"ldr d19, [a_ptr4]\n"
"ldr d23, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
+ "ldr q2, [%[a_ptr0]], #0x10\n"
+ "ldr q6, [a_ptr1], #0x10\n"
+ "ldr q10, [a_ptr2], #0x10\n"
+ "ldr q14, [a_ptr3], #0x10\n"
"ldr s3, [%[a_ptr0]], #0x4\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr s7, [a_ptr1], #0x4\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr s11, [a_ptr2], #0x4\n"
"ldr s15, [a_ptr3], #0x4\n"
"ldr s19, [a_ptr4], #0x4\n"
"ldr s23, [a_ptr5], #0x4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v3.b}[4], [%[a_ptr0]]\n"
"ld1 {v7.b}[4], [a_ptr1]\n"
@@ -2283,24 +2731,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ld1 {v19.b}[6], [a_ptr4]\n"
"ld1 {v23.b}[6], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2400,38 +2850,36 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr q24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "str q27, [c_ptr1]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
"ldr q24, [%[b_ptr0]]\n"
".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
@@ -2528,7 +2976,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
- "ldr q24, [%[b_ptr0]]\n"
".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n"
".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
@@ -2540,6 +2987,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
"add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
@@ -2659,19 +3107,124 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+ ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n"
+ ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
+ ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
+ ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
+ ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
+ ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2790,24 +3343,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ld1 {v19.b}[10], [a_ptr4]\n"
"ld1 {v23.b}[10], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2915,38 +3470,36 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr q24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q25, [%[b_ptr0], #0x10]\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "movi v26.4s, #0\n"
"subs %[loops], %[loops], #0x1\n"
- "str q27, [c_ptr1]\n"
+ "movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
"ldr q24, [%[b_ptr0]]\n"
".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
@@ -3051,20 +3604,20 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
- "ldr q25, [%[b_ptr0], #0x10]\n"
".inst 0x4f83eb1a // sdot v26.4s, v24.16b, v3.4b[2]\n"
".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n"
".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n"
".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
- "ldr q24, [%[b_ptr0]]\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "movi v26.4s, #0\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
@@ -3190,19 +3743,132 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
+ ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
+ ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
+ ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
+ ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
+ ".inst 0x4f83eb1a // sdot v26.4s, v24.16b, v3.4b[2]\n"
+ ".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n"
+ ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
+ ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
+ ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -3262,6 +3928,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q4, [a_ptr1], #0x10\n"
"ldr q8, [a_ptr2], #0x10\n"
@@ -3280,7 +3947,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ldr q14, [a_ptr3], #0x10\n"
"ldr q18, [a_ptr4], #0x10\n"
"ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr q3, [%[a_ptr0]]\n"
"ldr q7, [a_ptr1]\n"
"ldr q11, [a_ptr2]\n"
@@ -3289,8 +3955,27 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ldr q23, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
+ "ldr q2, [%[a_ptr0]], #0x10\n"
+ "ldr q6, [a_ptr1], #0x10\n"
+ "ldr q10, [a_ptr2], #0x10\n"
+ "ldr q14, [a_ptr3], #0x10\n"
"ldr d3, [%[a_ptr0]], #0x8\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr d7, [a_ptr1], #0x8\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr d11, [a_ptr2], #0x8\n"
"ldr d15, [a_ptr3], #0x8\n"
"ldr d19, [a_ptr4], #0x8\n"
@@ -3301,7 +3986,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ld1 {v15.s}[2], [a_ptr3], #4\n"
"ld1 {v19.s}[2], [a_ptr4], #4\n"
"ld1 {v23.s}[2], [a_ptr5], #4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v3.b}[12], [%[a_ptr0]]\n"
"ld1 {v7.b}[12], [a_ptr1]\n"
@@ -3328,24 +4012,26 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"ld1 {v19.b}[14], [a_ptr4]\n"
"ld1 {v23.b}[14], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
@@ -3460,38 +4146,36 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n"
".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n"
".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr q24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "str q27, [c_ptr1]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
"ldr q24, [%[b_ptr0]]\n"
".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
@@ -3603,7 +4287,6 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
- "ldr q24, [%[b_ptr0]]\n"
".inst 0x4fa3eb3a // sdot v26.4s, v25.16b, v3.4b[3]\n"
".inst 0x4fa7eb3b // sdot v27.4s, v25.16b, v7.4b[3]\n"
".inst 0x4fabeb3c // sdot v28.4s, v25.16b, v11.4b[3]\n"
@@ -3615,6 +4298,7 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
"add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
@@ -3749,19 +4433,139 @@ void a64_smallK_hybrid_s8s32_dot_4x6(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n"
".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n"
".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e31a // sdot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x4f84e31b // sdot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x4f88e31c // sdot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x4f8ce31d // sdot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x4f90e31e // sdot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x4f94e31f // sdot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0e33a // sdot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x4fa4e33b // sdot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x4fa8e33c // sdot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x4face33d // sdot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x4fb0e33e // sdot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x4fb4e33f // sdot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f80eb1a // sdot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f84eb1b // sdot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x4f88eb1c // sdot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x4f8ceb1d // sdot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x4f90eb1e // sdot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x4f94eb1f // sdot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa0eb3a // sdot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x4fa4eb3b // sdot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x4fa8eb3c // sdot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x4faceb3d // sdot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x4fb0eb3e // sdot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x4fb4eb3f // sdot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81e31a // sdot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85e31b // sdot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x4f89e31c // sdot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x4f8de31d // sdot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x4f91e31e // sdot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x4f95e31f // sdot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1e33a // sdot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x4fa5e33b // sdot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x4fa9e33c // sdot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x4fade33d // sdot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x4fb1e33e // sdot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x4fb5e33f // sdot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f81eb1a // sdot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f85eb1b // sdot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x4f89eb1c // sdot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x4f8deb1d // sdot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x4f91eb1e // sdot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x4f95eb1f // sdot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa1eb3a // sdot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x4fa5eb3b // sdot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x4fa9eb3c // sdot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x4fadeb3d // sdot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x4fb1eb3e // sdot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x4fb5eb3f // sdot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82e31a // sdot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86e31b // sdot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x4f8ae31c // sdot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x4f8ee31d // sdot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x4f92e31e // sdot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x4f96e31f // sdot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2e33a // sdot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x4fa6e33b // sdot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x4faae33c // sdot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x4faee33d // sdot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x4fb2e33e // sdot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x4fb6e33f // sdot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f82eb1a // sdot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f86eb1b // sdot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x4f8aeb1c // sdot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x4f8eeb1d // sdot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x4f92eb1e // sdot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x4f96eb1f // sdot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa2eb3a // sdot v26.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x4fa6eb3b // sdot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x4faaeb3c // sdot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x4faeeb3d // sdot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x4fb2eb3e // sdot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x4fb6eb3f // sdot v31.4s, v25.16b, v22.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f83e31a // sdot v26.4s, v24.16b, v3.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f87e31b // sdot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x4f8be31c // sdot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x4f8fe31d // sdot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x4f93e31e // sdot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x4f97e31f // sdot v31.4s, v24.16b, v23.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x4fa3e33a // sdot v26.4s, v25.16b, v3.4b[1]\n"
+ ".inst 0x4fa7e33b // sdot v27.4s, v25.16b, v7.4b[1]\n"
+ ".inst 0x4fabe33c // sdot v28.4s, v25.16b, v11.4b[1]\n"
+ ".inst 0x4fafe33d // sdot v29.4s, v25.16b, v15.4b[1]\n"
+ ".inst 0x4fb3e33e // sdot v30.4s, v25.16b, v19.4b[1]\n"
+ ".inst 0x4fb7e33f // sdot v31.4s, v25.16b, v23.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x4f83eb1a // sdot v26.4s, v24.16b, v3.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x4f87eb1b // sdot v27.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x4f8beb1c // sdot v28.4s, v24.16b, v11.4b[2]\n"
+ ".inst 0x4f8feb1d // sdot v29.4s, v24.16b, v15.4b[2]\n"
+ ".inst 0x4f93eb1e // sdot v30.4s, v24.16b, v19.4b[2]\n"
+ ".inst 0x4f97eb1f // sdot v31.4s, v24.16b, v23.4b[2]\n"
+ ".inst 0x4fa3eb3a // sdot v26.4s, v25.16b, v3.4b[3]\n"
+ ".inst 0x4fa7eb3b // sdot v27.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x4fabeb3c // sdot v28.4s, v25.16b, v11.4b[3]\n"
+ ".inst 0x4fafeb3d // sdot v29.4s, v25.16b, v15.4b[3]\n"
+ ".inst 0x4fb3eb3e // sdot v30.4s, v25.16b, v19.4b[3]\n"
+ ".inst 0x4fb7eb3f // sdot v31.4s, v25.16b, v23.4b[3]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp
index 3de708cc68..9f9c2a49db 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4.hpp
@@ -31,10 +31,10 @@ namespace arm_gemm
{
// Actual kernel implementations
-void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
-void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+void a64_smallK_hybrid_s8s32_dot_8x4(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+void a64_smallK_hybrid_s8s32_dot_8x4_a55(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
-class smallK_hybrid_s8s32_dot_4x8
+class cls_a64_smallK_hybrid_s8s32_dot_8x4
{
public:
typedef int8_t operand_type;
@@ -76,12 +76,12 @@ public:
StdTransformsFixed<operand_type, result_type, 8, 4, 4> transforms = {};
// Default to the generic kernel
- kern_type kernel=a64_smallK_hybrid_s8s32_dot_4x8;
+ kern_type kernel=a64_smallK_hybrid_s8s32_dot_8x4;
- smallK_hybrid_s8s32_dot_4x8(const CPUInfo *ci)
+ cls_a64_smallK_hybrid_s8s32_dot_8x4(const CPUInfo *ci)
{
if (ci->get_cpu_model() == CPUModel::A55r1) {
- kernel = a64_smallK_hybrid_s8s32_dot_4x8_a55;
+ kernel = a64_smallK_hybrid_s8s32_dot_8x4_a55;
}
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/a55.cpp
index 7135f2eee6..aba6e0d100 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/a55.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
+void a64_smallK_hybrid_s8s32_dot_8x4_a55(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
const long loops_count = iceildiv(N, (int)4) - 1;
const long ldab = lda * sizeof(int8_t);
const long ldcb = ldc * sizeof(int32_t);
@@ -157,22 +157,24 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"ld1 {v6.b}[2], [a_ptr6]\n"
"ld1 {v7.b}[2], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "movi v26.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "movi v27.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "movi v28.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "movi v29.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "movi v30.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "movi v31.4s, #0\n"
"add %[b_ptr0], %[b_ptr0], #0x10\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
@@ -181,55 +183,49 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
- "ins v16.d[1], temploadreg0\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ "ins v16.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
"str q26, [c_ptr2]\n"
- "movi v26.4s, #0\n"
"add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+ "movi v26.4s, #0\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
"add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
- ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
"prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
"prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "ins v16.d[1], temploadreg0\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
@@ -239,6 +235,8 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
@@ -268,23 +266,34 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -423,24 +432,26 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"ld1 {v6.b}[6], [a_ptr6]\n"
"ld1 {v7.b}[6], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
"movi v26.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v27.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
@@ -456,78 +467,72 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v16.d[1], temploadreg0\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v17.d[1], temploadreg1\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ins v16.d[1], temploadreg0\n"
+ "ins v17.d[1], temploadreg1\n"
"prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
"str q27, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
+ ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v17.d[1], temploadreg1\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
@@ -565,23 +570,42 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -720,26 +744,28 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"ld1 {v6.b}[10], [a_ptr6]\n"
"ld1 {v7.b}[10], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
"movi v27.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x30\n"
".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
@@ -762,95 +788,86 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "add %[b_ptr0], %[b_ptr0], #0x30\n"
- "ins v16.d[1], temploadreg0\n"
- "ins v17.d[1], temploadreg1\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v18.d[1], temploadreg2\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0], #0x20]\n"
"ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
"str q27, [c_ptr3]\n"
- "movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+ "movi v27.4s, #0\n"
+ "ins v16.d[1], temploadreg0\n"
+ "ins v17.d[1], temploadreg1\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"str q28, [c_ptr4]\n"
- "movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
"add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
- "ins v17.d[1], temploadreg1\n"
".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x30\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v18.d[1], temploadreg2\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
@@ -876,8 +893,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
@@ -893,23 +911,50 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
+ ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
+ ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1056,28 +1101,30 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"ld1 {v6.b}[14], [a_ptr6]\n"
"ld1 {v7.b}[14], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
@@ -1107,112 +1154,101 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "ins v16.d[1], temploadreg0\n"
- "ins v17.d[1], temploadreg1\n"
- "ins v18.d[1], temploadreg2\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v19.d[1], temploadreg3\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0], #0x20]\n"
"ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"str q27, [c_ptr3]\n"
- "movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+ "movi v27.4s, #0\n"
+ "ldr d19, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
"str q28, [c_ptr4]\n"
- "movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ins v16.d[1], temploadreg0\n"
+ "ins v17.d[1], temploadreg1\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"str q29, [c_ptr5]\n"
- "movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
+ "ins v19.d[1], temploadreg3\n"
+ ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
"add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
- "ins v17.d[1], temploadreg1\n"
".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x4fa1ea79 // sdot v25.4s, v19.16b, v1.4b[3]\n"
".inst 0x4fa2ea7a // sdot v26.4s, v19.16b, v2.4b[3]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x4fa3ea7b // sdot v27.4s, v19.16b, v3.4b[3]\n"
".inst 0x4fa4ea7c // sdot v28.4s, v19.16b, v4.4b[3]\n"
".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v19.d[1], temploadreg3\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
"str q27, [c_ptr3]\n"
@@ -1235,8 +1271,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
@@ -1260,23 +1297,58 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
+ ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
+ ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
+ ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea79 // sdot v25.4s, v19.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ea7a // sdot v26.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ea7b // sdot v27.4s, v19.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ea7c // sdot v28.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
+ ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1363,26 +1435,34 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q2, [a_ptr1], #0x10\n"
"ldr q4, [a_ptr2], #0x10\n"
"ldr q6, [a_ptr3], #0x10\n"
- "ldr q8, [a_ptr4], #0x10\n"
- "ldr q10, [a_ptr5], #0x10\n"
- "ldr q12, [a_ptr6], #0x10\n"
- "ldr q14, [a_ptr7], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr s1, [%[a_ptr0]]\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr s3, [a_ptr1]\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr s5, [a_ptr2]\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr s7, [a_ptr3]\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr s9, [a_ptr4]\n"
"ldr s11, [a_ptr5]\n"
"ldr s13, [a_ptr6]\n"
"ldr s15, [a_ptr7]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
"subs %[odds], %[odds], #0x1\n"
+ "ldr q2, [a_ptr1], #0x10\n"
+ "ldr q4, [a_ptr2], #0x10\n"
+ "ldr q6, [a_ptr3], #0x10\n"
+ "ldr q8, [a_ptr4], #0x10\n"
+ "ldr q10, [a_ptr5], #0x10\n"
+ "ldr q12, [a_ptr6], #0x10\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"b.ne 4f\n"
"ldr b1, [%[a_ptr0]]\n"
"ldr b3, [a_ptr1]\n"
@@ -1415,30 +1495,32 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"ld1 {v13.b}[2], [a_ptr6]\n"
"ld1 {v15.b}[2], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x50\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x50\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
@@ -1475,126 +1557,113 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
- "ins v16.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "add %[b_ptr0], %[b_ptr0], #0x50\n"
- "ins v17.d[1], temploadreg1\n"
- "ins v18.d[1], temploadreg2\n"
- "ins v19.d[1], temploadreg3\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v20.d[1], temploadreg0\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0], #0x20]\n"
"ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"str q27, [c_ptr3]\n"
- "movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "movi v27.4s, #0\n"
+ "ldr d19, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"str q28, [c_ptr4]\n"
- "movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ldr d20, [%[b_ptr0], #0x40]\n"
+ "ins v16.d[1], temploadreg0\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"str q29, [c_ptr5]\n"
- "movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ "ins v17.d[1], temploadreg1\n"
+ ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
+ "ins v19.d[1], temploadreg3\n"
+ ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ "ins v20.d[1], temploadreg0\n"
+ ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
+ ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x50\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
- "ins v17.d[1], temploadreg1\n"
".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
- "add %[b_ptr0], %[b_ptr0], #0x50\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v20.d[1], temploadreg0\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
@@ -1617,8 +1686,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x50\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
@@ -1650,23 +1720,66 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
+ ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1753,34 +1866,42 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q2, [a_ptr1], #0x10\n"
"ldr q4, [a_ptr2], #0x10\n"
"ldr q6, [a_ptr3], #0x10\n"
- "ldr q8, [a_ptr4], #0x10\n"
- "ldr q10, [a_ptr5], #0x10\n"
- "ldr q12, [a_ptr6], #0x10\n"
- "ldr q14, [a_ptr7], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr d1, [%[a_ptr0]]\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr d3, [a_ptr1]\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr d5, [a_ptr2]\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr d7, [a_ptr3]\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr d9, [a_ptr4]\n"
"ldr d11, [a_ptr5]\n"
"ldr d13, [a_ptr6]\n"
"ldr d15, [a_ptr7]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q2, [a_ptr1], #0x10\n"
+ "ldr q4, [a_ptr2], #0x10\n"
"ldr s1, [%[a_ptr0]], #0x4\n"
+ "ldr q6, [a_ptr3], #0x10\n"
"ldr s3, [a_ptr1], #0x4\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr s5, [a_ptr2], #0x4\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr s7, [a_ptr3], #0x4\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr s9, [a_ptr4], #0x4\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr s11, [a_ptr5], #0x4\n"
"ldr s13, [a_ptr6], #0x4\n"
"ldr s15, [a_ptr7], #0x4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v1.b}[4], [%[a_ptr0]]\n"
"ld1 {v3.b}[4], [a_ptr1]\n"
@@ -1813,32 +1934,34 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"ld1 {v13.b}[6], [a_ptr6]\n"
"ld1 {v15.b}[6], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ldr q21, [%[b_ptr0], #0x50]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x60\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x60\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
@@ -1882,146 +2005,132 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
- "ins v16.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d21, [%[b_ptr0], #0x50]\n"
- "ins v17.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "add %[b_ptr0], %[b_ptr0], #0x60\n"
- "ins v18.d[1], temploadreg2\n"
- "ins v19.d[1], temploadreg3\n"
- "ins v20.d[1], temploadreg0\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v21.d[1], temploadreg1\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0], #0x20]\n"
"ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"str q27, [c_ptr3]\n"
- "movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "movi v27.4s, #0\n"
+ "ldr d19, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"str q28, [c_ptr4]\n"
- "movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ldr d20, [%[b_ptr0], #0x40]\n"
+ "ins v16.d[1], temploadreg0\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"str q29, [c_ptr5]\n"
- "movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ "ldr d21, [%[b_ptr0], #0x50]\n"
+ ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ "ins v17.d[1], temploadreg1\n"
+ ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr7, c_ptr7, #0x10\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ "ins v19.d[1], temploadreg3\n"
+ ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ "ins v20.d[1], temploadreg0\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ins v21.d[1], temploadreg1\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x60\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
- "ins v17.d[1], temploadreg1\n"
".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
- "ldr d21, [%[b_ptr0], #0x50]\n"
- "add %[b_ptr0], %[b_ptr0], #0x60\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v21.d[1], temploadreg1\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
@@ -2038,7 +2147,7 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x60\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
@@ -2079,23 +2188,74 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
+ ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+ ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
+ ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
+ ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
+ ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
+ ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
+ ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
+ ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2242,34 +2402,36 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"ld1 {v13.b}[10], [a_ptr6]\n"
"ld1 {v15.b}[10], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ldr q21, [%[b_ptr0], #0x50]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ldr q22, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x70\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x70\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
@@ -2320,178 +2482,162 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
- "ins v16.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d21, [%[b_ptr0], #0x50]\n"
- "ins v17.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d22, [%[b_ptr0], #0x60]\n"
- "ins v18.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x70\n"
- "ins v19.d[1], temploadreg3\n"
- "ins v20.d[1], temploadreg0\n"
- "ins v21.d[1], temploadreg1\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v22.d[1], temploadreg2\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0], #0x20]\n"
"ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"str q27, [c_ptr3]\n"
- "movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "movi v27.4s, #0\n"
+ "ldr d19, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"str q28, [c_ptr4]\n"
- "movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ldr d20, [%[b_ptr0], #0x40]\n"
+ "ins v16.d[1], temploadreg0\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"str q29, [c_ptr5]\n"
- "movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ "ldr d21, [%[b_ptr0], #0x50]\n"
+ ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ "ins v17.d[1], temploadreg1\n"
+ ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr7, c_ptr7, #0x10\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ "ldr d22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ins v19.d[1], temploadreg3\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "ins v20.d[1], temploadreg0\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "ins v21.d[1], temploadreg1\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
+ "ins v22.d[1], temploadreg2\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x70\n"
".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
"prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
- "ins v17.d[1], temploadreg1\n"
".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
- "ldr d21, [%[b_ptr0], #0x50]\n"
".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
- "ins v21.d[1], temploadreg1\n"
".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
- "ldr d22, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x70\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v22.d[1], temploadreg2\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "ldr q22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr7, c_ptr7, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ "add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x70\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
@@ -2539,23 +2685,82 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
+ ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+ ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
+ ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
+ ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
+ ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
+ ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
+ ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
+ ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
+ ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
+ ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
+ ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
+ ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
+ ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
+ ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
+ ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
+ ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2643,6 +2848,7 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q2, [a_ptr1], #0x10\n"
"ldr q4, [a_ptr2], #0x10\n"
@@ -2651,7 +2857,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"ldr q10, [a_ptr5], #0x10\n"
"ldr q12, [a_ptr6], #0x10\n"
"ldr q14, [a_ptr7], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr q1, [%[a_ptr0]]\n"
"ldr q3, [a_ptr1]\n"
"ldr q5, [a_ptr2]\n"
@@ -2662,15 +2867,24 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"ldr q15, [a_ptr7]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q2, [a_ptr1], #0x10\n"
+ "ldr q4, [a_ptr2], #0x10\n"
"ldr d1, [%[a_ptr0]], #0x8\n"
+ "ldr q6, [a_ptr3], #0x10\n"
"ldr d3, [a_ptr1], #0x8\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr d5, [a_ptr2], #0x8\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr d7, [a_ptr3], #0x8\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr d9, [a_ptr4], #0x8\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr d11, [a_ptr5], #0x8\n"
"ldr d13, [a_ptr6], #0x8\n"
- "ldr d15, [a_ptr7], #0x8\n"
"ld1 {v1.s}[2], [%[a_ptr0]], #4\n"
+ "ldr d15, [a_ptr7], #0x8\n"
"ld1 {v3.s}[2], [a_ptr1], #4\n"
"ld1 {v5.s}[2], [a_ptr2], #4\n"
"ld1 {v7.s}[2], [a_ptr3], #4\n"
@@ -2678,7 +2892,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"ld1 {v11.s}[2], [a_ptr5], #4\n"
"ld1 {v13.s}[2], [a_ptr6], #4\n"
"ld1 {v15.s}[2], [a_ptr7], #4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v1.b}[12], [%[a_ptr0]]\n"
"ld1 {v3.b}[12], [a_ptr1]\n"
@@ -2711,36 +2924,38 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
"ld1 {v13.b}[14], [a_ptr6]\n"
"ld1 {v15.b}[14], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ldr q21, [%[b_ptr0], #0x50]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ldr q22, [%[b_ptr0], #0x60]\n"
- "movi v31.4s, #0\n"
"ldr q23, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
@@ -2798,192 +3013,248 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
- "ins v16.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d21, [%[b_ptr0], #0x50]\n"
- "ins v17.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d22, [%[b_ptr0], #0x60]\n"
- "ins v18.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d23, [%[b_ptr0], #0x70]\n"
- "ins v19.d[1], temploadreg3\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v20.d[1], temploadreg0\n"
- "ins v21.d[1], temploadreg1\n"
- "ins v22.d[1], temploadreg2\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v23.d[1], temploadreg3\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0], #0x20]\n"
"ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"str q27, [c_ptr3]\n"
- "movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "movi v27.4s, #0\n"
+ "ldr d19, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"str q28, [c_ptr4]\n"
- "movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ldr d20, [%[b_ptr0], #0x40]\n"
+ "ins v16.d[1], temploadreg0\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"str q29, [c_ptr5]\n"
- "movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ "ldr d21, [%[b_ptr0], #0x50]\n"
+ ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ "ins v17.d[1], temploadreg1\n"
+ ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr7, c_ptr7, #0x10\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ "ldr d22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr d23, [%[b_ptr0], #0x70]\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "ins v19.d[1], temploadreg3\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
+ "ins v20.d[1], temploadreg0\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ins v21.d[1], temploadreg1\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "ins v22.d[1], temploadreg2\n"
".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "ins v23.d[1], temploadreg3\n"
".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
- "ins v17.d[1], temploadreg1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
- "ldr d21, [%[b_ptr0], #0x50]\n"
".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
- "ins v21.d[1], temploadreg1\n"
".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
- "ldr d22, [%[b_ptr0], #0x60]\n"
".inst 0x4fa1eaf8 // sdot v24.4s, v23.16b, v1.4b[3]\n"
".inst 0x4fa3eaf9 // sdot v25.4s, v23.16b, v3.4b[3]\n"
".inst 0x4fa5eafa // sdot v26.4s, v23.16b, v5.4b[3]\n"
- "ins v22.d[1], temploadreg2\n"
".inst 0x4fa7eafb // sdot v27.4s, v23.16b, v7.4b[3]\n"
".inst 0x4fa9eafc // sdot v28.4s, v23.16b, v9.4b[3]\n"
".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
- "ldr d23, [%[b_ptr0], #0x70]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v23.d[1], temploadreg3\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "ldr q22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x70]\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
+ ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+ ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
+ ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
+ ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
+ ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
+ ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
+ ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
+ ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
+ ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
+ ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
+ ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
+ ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
+ ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
+ ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
+ ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
+ ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
+ ".inst 0x4fa1eaf8 // sdot v24.4s, v23.16b, v1.4b[3]\n"
+ ".inst 0x4fa3eaf9 // sdot v25.4s, v23.16b, v3.4b[3]\n"
+ ".inst 0x4fa5eafa // sdot v26.4s, v23.16b, v5.4b[3]\n"
+ ".inst 0x4fa7eafb // sdot v27.4s, v23.16b, v7.4b[3]\n"
+ ".inst 0x4fa9eafc // sdot v28.4s, v23.16b, v9.4b[3]\n"
+ ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
+ ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
+ ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
+ "b 9f\n"
+ "6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
@@ -3039,23 +3310,16 @@ void a64_smallK_hybrid_s8s32_dot_4x8_a55(const int8_t *A, int lda, const int8_t
".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
- "6:\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/generic.cpp
index c94e975754..7fcf853d2e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_4x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_s8s32_dot_8x4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
+void a64_smallK_hybrid_s8s32_dot_8x4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
const long loops_count = iceildiv(N, (int)4) - 1;
const long ldab = lda * sizeof(int8_t);
const long ldcb = ldc * sizeof(int32_t);
@@ -153,22 +153,24 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"ld1 {v6.b}[2], [a_ptr6]\n"
"ld1 {v7.b}[2], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "movi v26.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "movi v27.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "movi v28.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "movi v29.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "movi v30.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "movi v31.4s, #0\n"
"add %[b_ptr0], %[b_ptr0], #0x10\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
@@ -177,20 +179,17 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
+ "ldr q16, [%[b_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
@@ -216,10 +215,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
@@ -231,6 +229,8 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
@@ -260,23 +260,34 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -407,24 +418,26 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"ld1 {v6.b}[6], [a_ptr6]\n"
"ld1 {v7.b}[6], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
"movi v26.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v27.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
@@ -440,68 +453,66 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
+ "ldr q16, [%[b_ptr0]]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "str q25, [c_ptr1]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+ ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
+ "ldr q16, [%[b_ptr0]]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
"add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q25, [c_ptr1]\n"
@@ -541,23 +552,42 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -688,26 +718,28 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"ld1 {v6.b}[10], [a_ptr6]\n"
"ld1 {v7.b}[10], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
"movi v27.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x30\n"
".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
@@ -730,49 +762,46 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x30\n"
- "str q25, [c_ptr1]\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
@@ -788,7 +817,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
@@ -802,11 +830,12 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x30\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
@@ -832,8 +861,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
@@ -849,23 +879,50 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
+ ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
+ ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1004,28 +1061,30 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"ld1 {v6.b}[14], [a_ptr6]\n"
"ld1 {v7.b}[14], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
@@ -1055,50 +1114,47 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "str q25, [c_ptr1]\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
@@ -1114,7 +1170,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
@@ -1123,7 +1178,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x4fa1ea79 // sdot v25.4s, v19.16b, v1.4b[3]\n"
".inst 0x4fa2ea7a // sdot v26.4s, v19.16b, v2.4b[3]\n"
@@ -1137,14 +1191,16 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
"str q27, [c_ptr3]\n"
@@ -1167,8 +1223,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
@@ -1192,23 +1249,58 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f81e219 // sdot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x4f82e21a // sdot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f83e21b // sdot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x4f84e21c // sdot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f85e21d // sdot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x4f86e21e // sdot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f87e21f // sdot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa1e239 // sdot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x4fa2e23a // sdot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa3e23b // sdot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x4fa4e23c // sdot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa5e23d // sdot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x4fa6e23e // sdot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa7e23f // sdot v31.4s, v17.16b, v7.4b[1]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x4f81ea59 // sdot v25.4s, v18.16b, v1.4b[2]\n"
+ ".inst 0x4f82ea5a // sdot v26.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x4f83ea5b // sdot v27.4s, v18.16b, v3.4b[2]\n"
+ ".inst 0x4f84ea5c // sdot v28.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x4f85ea5d // sdot v29.4s, v18.16b, v5.4b[2]\n"
+ ".inst 0x4f86ea5e // sdot v30.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x4f87ea5f // sdot v31.4s, v18.16b, v7.4b[2]\n"
+ ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x4fa1ea79 // sdot v25.4s, v19.16b, v1.4b[3]\n"
+ ".inst 0x4fa2ea7a // sdot v26.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x4fa3ea7b // sdot v27.4s, v19.16b, v3.4b[3]\n"
+ ".inst 0x4fa4ea7c // sdot v28.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x4fa5ea7d // sdot v29.4s, v19.16b, v5.4b[3]\n"
+ ".inst 0x4fa6ea7e // sdot v30.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x4fa7ea7f // sdot v31.4s, v19.16b, v7.4b[3]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1287,26 +1379,34 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q2, [a_ptr1], #0x10\n"
"ldr q4, [a_ptr2], #0x10\n"
"ldr q6, [a_ptr3], #0x10\n"
- "ldr q8, [a_ptr4], #0x10\n"
- "ldr q10, [a_ptr5], #0x10\n"
- "ldr q12, [a_ptr6], #0x10\n"
- "ldr q14, [a_ptr7], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr s1, [%[a_ptr0]]\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr s3, [a_ptr1]\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr s5, [a_ptr2]\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr s7, [a_ptr3]\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr s9, [a_ptr4]\n"
"ldr s11, [a_ptr5]\n"
"ldr s13, [a_ptr6]\n"
"ldr s15, [a_ptr7]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
"subs %[odds], %[odds], #0x1\n"
+ "ldr q2, [a_ptr1], #0x10\n"
+ "ldr q4, [a_ptr2], #0x10\n"
+ "ldr q6, [a_ptr3], #0x10\n"
+ "ldr q8, [a_ptr4], #0x10\n"
+ "ldr q10, [a_ptr5], #0x10\n"
+ "ldr q12, [a_ptr6], #0x10\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"b.ne 4f\n"
"ldr b1, [%[a_ptr0]]\n"
"ldr b3, [a_ptr1]\n"
@@ -1339,30 +1439,32 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"ld1 {v13.b}[2], [a_ptr6]\n"
"ld1 {v15.b}[2], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x50\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x50\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
@@ -1399,51 +1501,48 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
- "add %[b_ptr0], %[b_ptr0], #0x50\n"
- "str q25, [c_ptr1]\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x50\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
@@ -1459,7 +1558,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
@@ -1468,7 +1566,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
@@ -1477,7 +1574,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
@@ -1491,14 +1587,17 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
- "add %[b_ptr0], %[b_ptr0], #0x50\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
@@ -1521,8 +1620,9 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x50\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
@@ -1554,23 +1654,66 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
+ ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1649,34 +1792,42 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q2, [a_ptr1], #0x10\n"
"ldr q4, [a_ptr2], #0x10\n"
"ldr q6, [a_ptr3], #0x10\n"
- "ldr q8, [a_ptr4], #0x10\n"
- "ldr q10, [a_ptr5], #0x10\n"
- "ldr q12, [a_ptr6], #0x10\n"
- "ldr q14, [a_ptr7], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr d1, [%[a_ptr0]]\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr d3, [a_ptr1]\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr d5, [a_ptr2]\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr d7, [a_ptr3]\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr d9, [a_ptr4]\n"
"ldr d11, [a_ptr5]\n"
"ldr d13, [a_ptr6]\n"
"ldr d15, [a_ptr7]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q2, [a_ptr1], #0x10\n"
+ "ldr q4, [a_ptr2], #0x10\n"
"ldr s1, [%[a_ptr0]], #0x4\n"
+ "ldr q6, [a_ptr3], #0x10\n"
"ldr s3, [a_ptr1], #0x4\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr s5, [a_ptr2], #0x4\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr s7, [a_ptr3], #0x4\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr s9, [a_ptr4], #0x4\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr s11, [a_ptr5], #0x4\n"
"ldr s13, [a_ptr6], #0x4\n"
"ldr s15, [a_ptr7], #0x4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v1.b}[4], [%[a_ptr0]]\n"
"ld1 {v3.b}[4], [a_ptr1]\n"
@@ -1709,32 +1860,34 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"ld1 {v13.b}[6], [a_ptr6]\n"
"ld1 {v15.b}[6], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ldr q21, [%[b_ptr0], #0x50]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x60\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x60\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
@@ -1778,68 +1931,64 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ldr q21, [%[b_ptr0], #0x50]\n"
- "add %[b_ptr0], %[b_ptr0], #0x60\n"
- "str q25, [c_ptr1]\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x60\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
@@ -1848,7 +1997,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
@@ -1857,7 +2005,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
@@ -1866,7 +2013,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
@@ -1880,20 +2026,25 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ldr q21, [%[b_ptr0], #0x50]\n"
- "add %[b_ptr0], %[b_ptr0], #0x60\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
@@ -1910,7 +2061,7 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x60\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
@@ -1951,23 +2102,74 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
+ ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+ ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
+ ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
+ ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
+ ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
+ ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
+ ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
+ ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2106,34 +2308,36 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"ld1 {v13.b}[10], [a_ptr6]\n"
"ld1 {v15.b}[10], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ldr q21, [%[b_ptr0], #0x50]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ldr q22, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x70\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x70\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
@@ -2184,34 +2388,31 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
- "ldr q21, [%[b_ptr0], #0x50]\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ldr q22, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x70\n"
- "str q25, [c_ptr1]\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "ldr q22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
@@ -2230,24 +2431,23 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x70\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
@@ -2255,7 +2455,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
@@ -2264,7 +2463,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
@@ -2273,7 +2471,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
@@ -2282,7 +2479,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
- "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
@@ -2296,38 +2492,44 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ldr q22, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x70\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "ldr q22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr7, c_ptr7, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ "add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x70\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
@@ -2375,23 +2577,82 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
+ ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
+ ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+ ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
+ ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
+ ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
+ ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
+ ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
+ ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
+ ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
+ ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
+ ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
+ ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
+ ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
+ ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
+ ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
+ ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
+ ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2471,6 +2732,7 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q2, [a_ptr1], #0x10\n"
"ldr q4, [a_ptr2], #0x10\n"
@@ -2479,7 +2741,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"ldr q10, [a_ptr5], #0x10\n"
"ldr q12, [a_ptr6], #0x10\n"
"ldr q14, [a_ptr7], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr q1, [%[a_ptr0]]\n"
"ldr q3, [a_ptr1]\n"
"ldr q5, [a_ptr2]\n"
@@ -2490,15 +2751,24 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"ldr q15, [a_ptr7]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q2, [a_ptr1], #0x10\n"
+ "ldr q4, [a_ptr2], #0x10\n"
"ldr d1, [%[a_ptr0]], #0x8\n"
+ "ldr q6, [a_ptr3], #0x10\n"
"ldr d3, [a_ptr1], #0x8\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr d5, [a_ptr2], #0x8\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr d7, [a_ptr3], #0x8\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr d9, [a_ptr4], #0x8\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr d11, [a_ptr5], #0x8\n"
"ldr d13, [a_ptr6], #0x8\n"
- "ldr d15, [a_ptr7], #0x8\n"
"ld1 {v1.s}[2], [%[a_ptr0]], #4\n"
+ "ldr d15, [a_ptr7], #0x8\n"
"ld1 {v3.s}[2], [a_ptr1], #4\n"
"ld1 {v5.s}[2], [a_ptr2], #4\n"
"ld1 {v7.s}[2], [a_ptr3], #4\n"
@@ -2506,7 +2776,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"ld1 {v11.s}[2], [a_ptr5], #4\n"
"ld1 {v13.s}[2], [a_ptr6], #4\n"
"ld1 {v15.s}[2], [a_ptr7], #4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v1.b}[12], [%[a_ptr0]]\n"
"ld1 {v3.b}[12], [a_ptr1]\n"
@@ -2539,36 +2808,38 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"ld1 {v13.b}[14], [a_ptr6]\n"
"ld1 {v15.b}[14], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ldr q21, [%[b_ptr0], #0x50]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ldr q22, [%[b_ptr0], #0x60]\n"
- "movi v31.4s, #0\n"
"ldr q23, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
@@ -2626,39 +2897,37 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
- "ldr q21, [%[b_ptr0], #0x50]\n"
- "ldr q22, [%[b_ptr0], #0x60]\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ldr q23, [%[b_ptr0], #0x70]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "str q25, [c_ptr1]\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "ldr q22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x70]\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
@@ -2673,32 +2942,29 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
@@ -2707,7 +2973,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
@@ -2716,7 +2981,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
@@ -2725,7 +2989,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
- "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
@@ -2734,7 +2997,6 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
- "ldr q22, [%[b_ptr0], #0x60]\n"
".inst 0x4fa1eaf8 // sdot v24.4s, v23.16b, v1.4b[3]\n"
".inst 0x4fa3eaf9 // sdot v25.4s, v23.16b, v3.4b[3]\n"
".inst 0x4fa5eafa // sdot v26.4s, v23.16b, v5.4b[3]\n"
@@ -2748,38 +3010,119 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ldr q23, [%[b_ptr0], #0x70]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ "ldr q22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x70]\n"
".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
+ ".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x4fa8e23c // sdot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x4faae23d // sdot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x4face23e // sdot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x4faee23f // sdot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x4f80ea58 // sdot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x4f82ea59 // sdot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x4f84ea5a // sdot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x4f86ea5b // sdot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x4f88ea5c // sdot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x4f8aea5d // sdot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x4f8cea5e // sdot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x4f8eea5f // sdot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x4fa0ea78 // sdot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x4fa2ea79 // sdot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x4fa4ea7a // sdot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x4fa6ea7b // sdot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x4fa8ea7c // sdot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x4faaea7d // sdot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x4facea7e // sdot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x4faeea7f // sdot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x4f81e298 // sdot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x4f83e299 // sdot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x4f85e29a // sdot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x4f87e29b // sdot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x4f89e29c // sdot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x4f8be29d // sdot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x4f8de29e // sdot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x4f8fe29f // sdot v31.4s, v20.16b, v15.4b[0]\n"
+ ".inst 0x4fa1e2b8 // sdot v24.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x4fa3e2b9 // sdot v25.4s, v21.16b, v3.4b[1]\n"
+ ".inst 0x4fa5e2ba // sdot v26.4s, v21.16b, v5.4b[1]\n"
+ ".inst 0x4fa7e2bb // sdot v27.4s, v21.16b, v7.4b[1]\n"
+ ".inst 0x4fa9e2bc // sdot v28.4s, v21.16b, v9.4b[1]\n"
+ ".inst 0x4fabe2bd // sdot v29.4s, v21.16b, v11.4b[1]\n"
+ ".inst 0x4fade2be // sdot v30.4s, v21.16b, v13.4b[1]\n"
+ ".inst 0x4fafe2bf // sdot v31.4s, v21.16b, v15.4b[1]\n"
+ ".inst 0x4f81ead8 // sdot v24.4s, v22.16b, v1.4b[2]\n"
+ ".inst 0x4f83ead9 // sdot v25.4s, v22.16b, v3.4b[2]\n"
+ ".inst 0x4f85eada // sdot v26.4s, v22.16b, v5.4b[2]\n"
+ ".inst 0x4f87eadb // sdot v27.4s, v22.16b, v7.4b[2]\n"
+ ".inst 0x4f89eadc // sdot v28.4s, v22.16b, v9.4b[2]\n"
+ ".inst 0x4f8beadd // sdot v29.4s, v22.16b, v11.4b[2]\n"
+ ".inst 0x4f8deade // sdot v30.4s, v22.16b, v13.4b[2]\n"
+ ".inst 0x4f8feadf // sdot v31.4s, v22.16b, v15.4b[2]\n"
+ ".inst 0x4fa1eaf8 // sdot v24.4s, v23.16b, v1.4b[3]\n"
+ ".inst 0x4fa3eaf9 // sdot v25.4s, v23.16b, v3.4b[3]\n"
+ ".inst 0x4fa5eafa // sdot v26.4s, v23.16b, v5.4b[3]\n"
+ ".inst 0x4fa7eafb // sdot v27.4s, v23.16b, v7.4b[3]\n"
+ ".inst 0x4fa9eafc // sdot v28.4s, v23.16b, v9.4b[3]\n"
+ ".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
+ ".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
+ ".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
+ "b 9f\n"
+ "6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x4f80e218 // sdot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x4f82e219 // sdot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x4f84e21a // sdot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x4f86e21b // sdot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x4f88e21c // sdot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x4f8ae21d // sdot v29.4s, v16.16b, v10.4b[0]\n"
".inst 0x4f8ce21e // sdot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x4f8ee21f // sdot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x4fa0e238 // sdot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x4fa2e239 // sdot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x4fa4e23a // sdot v26.4s, v17.16b, v4.4b[1]\n"
".inst 0x4fa6e23b // sdot v27.4s, v17.16b, v6.4b[1]\n"
@@ -2835,23 +3178,16 @@ void a64_smallK_hybrid_s8s32_dot_4x8(const int8_t *A, int lda, const int8_t *B,
".inst 0x4fabeafd // sdot v29.4s, v23.16b, v11.4b[3]\n"
".inst 0x4fadeafe // sdot v30.4s, v23.16b, v13.4b[3]\n"
".inst 0x4fafeaff // sdot v31.4s, v23.16b, v15.4b[3]\n"
- "6:\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp
index 76931db4dd..5d48a52d42 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4.hpp
@@ -31,10 +31,10 @@ namespace arm_gemm
{
// Actual kernel implementations
-void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
-void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+void a64_smallK_hybrid_u8u32_dot_6x4(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+void a64_smallK_hybrid_u8u32_dot_6x4_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
-class smallK_hybrid_u8u32_dot_4x6
+class cls_a64_smallK_hybrid_u8u32_dot_6x4
{
public:
typedef uint8_t operand_type;
@@ -76,12 +76,12 @@ public:
StdTransformsFixed<operand_type, result_type, 6, 4, 4> transforms = {};
// Default to the generic kernel
- kern_type kernel=a64_smallK_hybrid_u8u32_dot_4x6;
+ kern_type kernel=a64_smallK_hybrid_u8u32_dot_6x4;
- smallK_hybrid_u8u32_dot_4x6(const CPUInfo *ci)
+ cls_a64_smallK_hybrid_u8u32_dot_6x4(const CPUInfo *ci)
{
if (ci->get_cpu_model() == CPUModel::A55r1) {
- kernel = a64_smallK_hybrid_u8u32_dot_4x6_a55;
+ kernel = a64_smallK_hybrid_u8u32_dot_6x4_a55;
}
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp
index 02894d8327..dddf4c5aa2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/a55.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
+void a64_smallK_hybrid_u8u32_dot_6x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
const long loops_count = iceildiv(N, (int)4) - 1;
const long ldab = lda * sizeof(uint8_t);
const long ldcb = ldc * sizeof(uint32_t);
@@ -97,6 +97,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q3, [a_ptr1], #0x10\n"
"ldr q6, [a_ptr2], #0x10\n"
@@ -107,18 +108,29 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ldr q4, [a_ptr1], #0x10\n"
"ldr q7, [a_ptr2], #0x10\n"
"ldr q10, [a_ptr3], #0x10\n"
- "ldr q13, [a_ptr4], #0x10\n"
- "ldr q16, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr s2, [%[a_ptr0]]\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr s5, [a_ptr1]\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr s8, [a_ptr2]\n"
"ldr s11, [a_ptr3]\n"
"ldr s14, [a_ptr4]\n"
"ldr s17, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
"subs %[odds], %[odds], #0x1\n"
+ "ldr q3, [a_ptr1], #0x10\n"
+ "ldr q6, [a_ptr2], #0x10\n"
+ "ldr q9, [a_ptr3], #0x10\n"
+ "ldr q12, [a_ptr4], #0x10\n"
+ "ldr q15, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q7, [a_ptr2], #0x10\n"
+ "ldr q10, [a_ptr3], #0x10\n"
+ "ldr q13, [a_ptr4], #0x10\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"b.ne 4f\n"
"ldr b2, [%[a_ptr0]]\n"
"ldr b5, [a_ptr1]\n"
@@ -145,40 +157,42 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v14.b}[2], [a_ptr4]\n"
"ld1 {v17.b}[2], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
@@ -222,173 +236,219 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr d18, [%[b_ptr0]]\n"
+ "b.eq 7f\n"
+ "8:\n"
+ "str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0]]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "str q27, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
"ldr d19, [%[b_ptr0], #0x10]\n"
"ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
+ "movi v28.4s, #0\n"
"ldr d20, [%[b_ptr0], #0x20]\n"
"ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ "movi v29.4s, #0\n"
"ldr d21, [%[b_ptr0], #0x30]\n"
"ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "str q30, [c_ptr4]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ "movi v30.4s, #0\n"
"ldr d22, [%[b_ptr0], #0x40]\n"
"ins v18.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "str q31, [c_ptr5]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
+ "movi v31.4s, #0\n"
"ldr temploadreg2, [%[b_ptr0], #0x48]\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"ldr d23, [%[b_ptr0], #0x50]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"ins v19.d[1], temploadreg3\n"
+ ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
"ldr temploadreg3, [%[b_ptr0], #0x58]\n"
+ ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
"ldr d24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
"ins v20.d[1], temploadreg0\n"
+ ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
"ldr temploadreg0, [%[b_ptr0], #0x68]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x70]\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"ins v21.d[1], temploadreg1\n"
+ ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
"ldr temploadreg1, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
"ins v22.d[1], temploadreg2\n"
+ ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
"ins v23.d[1], temploadreg3\n"
+ ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
+ ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
- "b.eq 7f\n"
- "8:\n"
- "str q26, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "movi v26.4s, #0\n"
+ ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+ "ldr d18, [%[b_ptr0]]\n"
+ ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
+ ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
+ ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
+ ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
+ ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
+ ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
+ ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
+ "b.ne 8b\n"
+ "7:\n"
+ "str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
- ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr d21, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
- "ins v21.d[1], temploadreg1\n"
".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr d22, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr d23, [%[b_ptr0], #0x50]\n"
".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
- "ins v23.d[1], temploadreg3\n"
".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr d24, [%[b_ptr0], #0x60]\n"
".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
- "ins v24.d[1], temploadreg0\n"
".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x70]\n"
".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
- "ins v25.d[1], temploadreg1\n"
".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
- "ins v18.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v22.d[1], temploadreg2\n"
- "b.ne 8b\n"
- "7:\n"
- "str q26, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "b 9f\n"
+ "6:\n"
"movi v26.4s, #0\n"
- "str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
- "str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
- "str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
- "str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
- "str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
- ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
@@ -435,19 +495,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -514,6 +569,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q3, [a_ptr1], #0x10\n"
"ldr q6, [a_ptr2], #0x10\n"
@@ -524,24 +580,35 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ldr q4, [a_ptr1], #0x10\n"
"ldr q7, [a_ptr2], #0x10\n"
"ldr q10, [a_ptr3], #0x10\n"
- "ldr q13, [a_ptr4], #0x10\n"
- "ldr q16, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr d2, [%[a_ptr0]]\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr d5, [a_ptr1]\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr d8, [a_ptr2]\n"
"ldr d11, [a_ptr3]\n"
"ldr d14, [a_ptr4]\n"
"ldr d17, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q3, [a_ptr1], #0x10\n"
+ "ldr q6, [a_ptr2], #0x10\n"
+ "ldr q9, [a_ptr3], #0x10\n"
+ "ldr q12, [a_ptr4], #0x10\n"
+ "ldr q15, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q7, [a_ptr2], #0x10\n"
+ "ldr q10, [a_ptr3], #0x10\n"
"ldr s2, [%[a_ptr0]], #0x4\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr s5, [a_ptr1], #0x4\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr s8, [a_ptr2], #0x4\n"
"ldr s11, [a_ptr3], #0x4\n"
"ldr s14, [a_ptr4], #0x4\n"
"ldr s17, [a_ptr5], #0x4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v2.b}[4], [%[a_ptr0]]\n"
"ld1 {v5.b}[4], [a_ptr1]\n"
@@ -568,38 +635,40 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v14.b}[6], [a_ptr4]\n"
"ld1 {v17.b}[6], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
@@ -652,180 +721,233 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr d18, [%[b_ptr0]]\n"
+ "b.eq 7f\n"
+ "8:\n"
+ "str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0]]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "str q27, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
"ldr d19, [%[b_ptr0], #0x10]\n"
"ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
+ "movi v28.4s, #0\n"
"ldr d20, [%[b_ptr0], #0x20]\n"
"ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ "movi v29.4s, #0\n"
"ldr d21, [%[b_ptr0], #0x30]\n"
"ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "str q30, [c_ptr4]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ "movi v30.4s, #0\n"
"ldr d22, [%[b_ptr0], #0x40]\n"
"ins v18.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "str q31, [c_ptr5]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
+ "movi v31.4s, #0\n"
"ldr temploadreg2, [%[b_ptr0], #0x48]\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"ldr d23, [%[b_ptr0], #0x50]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"ins v19.d[1], temploadreg3\n"
+ ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
"ldr temploadreg3, [%[b_ptr0], #0x58]\n"
+ ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
"ldr d24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
"ins v20.d[1], temploadreg0\n"
+ ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
"ldr temploadreg0, [%[b_ptr0], #0x68]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x70]\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"ins v21.d[1], temploadreg1\n"
+ ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
"ldr temploadreg1, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
"ins v22.d[1], temploadreg2\n"
+ ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
"ins v23.d[1], temploadreg3\n"
+ ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
+ ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
- "b.eq 7f\n"
- "8:\n"
- "str q26, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "movi v26.4s, #0\n"
+ ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+ "ldr d18, [%[b_ptr0]]\n"
+ ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+ "ldr d19, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
"ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
+ "ins v19.d[1], temploadreg3\n"
+ ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
+ ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
+ ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
+ ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
+ ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
+ ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
+ ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
+ ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
+ "b.ne 8b\n"
+ "7:\n"
+ "str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
"add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr d21, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
- "ins v21.d[1], temploadreg1\n"
".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr d22, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr d23, [%[b_ptr0], #0x50]\n"
".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr d24, [%[b_ptr0], #0x60]\n"
".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
- "ins v24.d[1], temploadreg0\n"
".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x70]\n"
".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
- "ins v25.d[1], temploadreg1\n"
".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
- "ins v22.d[1], temploadreg2\n"
- "ins v19.d[1], temploadreg3\n"
- "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v23.d[1], temploadreg3\n"
- "b.ne 8b\n"
- "7:\n"
- "str q26, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "b 9f\n"
+ "6:\n"
"movi v26.4s, #0\n"
- "str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
- "str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
- "str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
- "str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
- "str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
@@ -881,19 +1003,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1014,38 +1131,40 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v14.b}[10], [a_ptr4]\n"
"ld1 {v17.b}[10], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
@@ -1105,189 +1224,249 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr d18, [%[b_ptr0]]\n"
+ "b.eq 7f\n"
+ "8:\n"
+ "str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0]]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "str q27, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
"ldr d19, [%[b_ptr0], #0x10]\n"
"ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
+ "movi v28.4s, #0\n"
"ldr d20, [%[b_ptr0], #0x20]\n"
"ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ "movi v29.4s, #0\n"
"ldr d21, [%[b_ptr0], #0x30]\n"
"ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "str q30, [c_ptr4]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ "movi v30.4s, #0\n"
"ldr d22, [%[b_ptr0], #0x40]\n"
"ins v18.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "str q31, [c_ptr5]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
+ "movi v31.4s, #0\n"
"ldr temploadreg2, [%[b_ptr0], #0x48]\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"ldr d23, [%[b_ptr0], #0x50]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"ins v19.d[1], temploadreg3\n"
+ ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
"ldr temploadreg3, [%[b_ptr0], #0x58]\n"
+ ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
"ldr d24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
"ins v20.d[1], temploadreg0\n"
+ ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
"ldr temploadreg0, [%[b_ptr0], #0x68]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x70]\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"ins v21.d[1], temploadreg1\n"
+ ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
"ldr temploadreg1, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
"ins v22.d[1], temploadreg2\n"
+ ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
"ins v23.d[1], temploadreg3\n"
+ ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
+ ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
- "b.eq 7f\n"
- "8:\n"
- "str q26, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "movi v26.4s, #0\n"
+ ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+ "ldr d18, [%[b_ptr0]]\n"
+ ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+ "ldr d19, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
"ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
+ "ldr d20, [%[b_ptr0], #0x20]\n"
+ ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
+ "ins v19.d[1], temploadreg3\n"
+ ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
+ "ins v20.d[1], temploadreg0\n"
+ ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
+ ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
+ ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
+ ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
+ ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
+ ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
+ ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
+ ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
+ ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
+ ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
+ ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
+ ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
+ "b.ne 8b\n"
+ "7:\n"
+ "str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
"add %[b_ptr0], %[b_ptr0], #0x30\n"
".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr d21, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
- "ins v21.d[1], temploadreg1\n"
".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr d22, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr d23, [%[b_ptr0], #0x50]\n"
".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr d24, [%[b_ptr0], #0x60]\n"
".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x70]\n"
".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
- "ins v25.d[1], temploadreg1\n"
".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
- "ins v22.d[1], temploadreg2\n"
".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
- "ins v23.d[1], temploadreg3\n"
- "ins v20.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v24.d[1], temploadreg0\n"
- "b.ne 8b\n"
- "7:\n"
- "str q26, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "b 9f\n"
+ "6:\n"
"movi v26.4s, #0\n"
- "str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
- "str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
- "str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
- "str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
- "str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1350,19 +1529,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1429,6 +1603,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q3, [a_ptr1], #0x10\n"
"ldr q6, [a_ptr2], #0x10\n"
@@ -1441,7 +1616,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ldr q10, [a_ptr3], #0x10\n"
"ldr q13, [a_ptr4], #0x10\n"
"ldr q16, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr q2, [%[a_ptr0]]\n"
"ldr q5, [a_ptr1]\n"
"ldr q8, [a_ptr2]\n"
@@ -1450,8 +1624,21 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ldr q17, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q3, [a_ptr1], #0x10\n"
+ "ldr q6, [a_ptr2], #0x10\n"
+ "ldr q9, [a_ptr3], #0x10\n"
+ "ldr q12, [a_ptr4], #0x10\n"
+ "ldr q15, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q7, [a_ptr2], #0x10\n"
+ "ldr q10, [a_ptr3], #0x10\n"
"ldr d2, [%[a_ptr0]], #0x8\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr d5, [a_ptr1], #0x8\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr d8, [a_ptr2], #0x8\n"
"ldr d11, [a_ptr3], #0x8\n"
"ldr d14, [a_ptr4], #0x8\n"
@@ -1462,7 +1649,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v11.s}[2], [a_ptr3], #4\n"
"ld1 {v14.s}[2], [a_ptr4], #4\n"
"ld1 {v17.s}[2], [a_ptr5], #4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v2.b}[12], [%[a_ptr0]]\n"
"ld1 {v5.b}[12], [a_ptr1]\n"
@@ -1489,38 +1675,40 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v14.b}[14], [a_ptr4]\n"
"ld1 {v17.b}[14], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
@@ -1587,198 +1775,265 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr d18, [%[b_ptr0]]\n"
+ "b.eq 7f\n"
+ "8:\n"
+ "str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0]]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "str q27, [c_ptr1]\n"
+ "add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
"ldr d19, [%[b_ptr0], #0x10]\n"
"ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
+ "movi v28.4s, #0\n"
"ldr d20, [%[b_ptr0], #0x20]\n"
"ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ "movi v29.4s, #0\n"
"ldr d21, [%[b_ptr0], #0x30]\n"
"ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "str q30, [c_ptr4]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ "movi v30.4s, #0\n"
"ldr d22, [%[b_ptr0], #0x40]\n"
"ins v18.d[1], temploadreg2\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "str q31, [c_ptr5]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
+ "movi v31.4s, #0\n"
"ldr temploadreg2, [%[b_ptr0], #0x48]\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"ldr d23, [%[b_ptr0], #0x50]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"ins v19.d[1], temploadreg3\n"
+ ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
"ldr temploadreg3, [%[b_ptr0], #0x58]\n"
+ ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
"ldr d24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
"ins v20.d[1], temploadreg0\n"
+ ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
"ldr temploadreg0, [%[b_ptr0], #0x68]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x70]\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"ins v21.d[1], temploadreg1\n"
+ ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
"ldr temploadreg1, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
"ins v22.d[1], temploadreg2\n"
+ ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
"ins v23.d[1], temploadreg3\n"
+ ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
- "b.eq 7f\n"
- "8:\n"
- "str q26, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "movi v26.4s, #0\n"
+ ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
+ ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+ "ldr d18, [%[b_ptr0]]\n"
+ ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
"ldr temploadreg2, [%[b_ptr0], #0x8]\n"
+ ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+ "ldr d19, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
+ "ldr d20, [%[b_ptr0], #0x20]\n"
+ ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
+ ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
+ "ins v19.d[1], temploadreg3\n"
+ ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
+ "ldr d21, [%[b_ptr0], #0x30]\n"
+ ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
+ "ins v20.d[1], temploadreg0\n"
+ ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
+ "ins v21.d[1], temploadreg1\n"
+ ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
+ ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
+ ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
+ ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
+ ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
+ ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
+ ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
+ ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
+ ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
+ ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
+ ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
+ ".inst 0x6fa2eaba // udot v26.4s, v21.16b, v2.4b[3]\n"
+ ".inst 0x6fa5eabb // udot v27.4s, v21.16b, v5.4b[3]\n"
+ ".inst 0x6fa8eabc // udot v28.4s, v21.16b, v8.4b[3]\n"
+ ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
+ ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
+ ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
+ "b.ne 8b\n"
+ "7:\n"
+ "str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr d21, [%[b_ptr0], #0x30]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
"add %[b_ptr0], %[b_ptr0], #0x40\n"
".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
- "ins v21.d[1], temploadreg1\n"
".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x8]\n"
".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x18]\n"
".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x28]\n"
".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x38]\n"
".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
- "ldr d22, [%[b_ptr0], #0x40]\n"
".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr d23, [%[b_ptr0], #0x50]\n"
".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr d24, [%[b_ptr0], #0x60]\n"
".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x70]\n"
".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr d18, [%[b_ptr0]]\n"
".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x48]\n"
".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr d19, [%[b_ptr0], #0x10]\n"
".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
- "ins v22.d[1], temploadreg2\n"
".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x58]\n"
".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
- "ldr d20, [%[b_ptr0], #0x20]\n"
".inst 0x6fa2eaba // udot v26.4s, v21.16b, v2.4b[3]\n"
- "ins v23.d[1], temploadreg3\n"
".inst 0x6fa5eabb // udot v27.4s, v21.16b, v5.4b[3]\n"
".inst 0x6fa8eabc // udot v28.4s, v21.16b, v8.4b[3]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x68]\n"
".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
- "ldr d21, [%[b_ptr0], #0x30]\n"
- "ins v24.d[1], temploadreg0\n"
- "ins v21.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "b.ne 8b\n"
- "7:\n"
- "str q26, [%[c_ptr0]]\n"
- "add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "b 9f\n"
+ "6:\n"
"movi v26.4s, #0\n"
- "ins v25.d[1], temploadreg1\n"
- "str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
- "str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
- "str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
- "str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
- "str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1848,19 +2103,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1927,6 +2177,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q4, [a_ptr1], #0x10\n"
"ldr q8, [a_ptr2], #0x10\n"
@@ -1943,18 +2194,35 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ldr q6, [a_ptr1], #0x10\n"
"ldr q10, [a_ptr2], #0x10\n"
"ldr q14, [a_ptr3], #0x10\n"
- "ldr q18, [a_ptr4], #0x10\n"
- "ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr s3, [%[a_ptr0]]\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr s7, [a_ptr1]\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr s11, [a_ptr2]\n"
"ldr s15, [a_ptr3]\n"
"ldr s19, [a_ptr4]\n"
"ldr s23, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
"subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
+ "ldr q2, [%[a_ptr0]], #0x10\n"
+ "ldr q6, [a_ptr1], #0x10\n"
+ "ldr q10, [a_ptr2], #0x10\n"
+ "ldr q14, [a_ptr3], #0x10\n"
+ "ldr q18, [a_ptr4], #0x10\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"b.ne 4f\n"
"ldr b3, [%[a_ptr0]]\n"
"ldr b7, [a_ptr1]\n"
@@ -1981,24 +2249,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v19.b}[2], [a_ptr4]\n"
"ld1 {v23.b}[2], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2091,57 +2361,55 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr d24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
- "ins v25.d[1], temploadreg1\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v26.4s, #0\n"
+ "ldr d24, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+ "ldr d25, [%[b_ptr0], #0x10]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q28, [c_ptr2]\n"
- "movi v28.4s, #0\n"
"add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ins v24.d[1], temploadreg0\n"
+ "ins v25.d[1], temploadreg1\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q29, [c_ptr3]\n"
- "movi v29.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
"ldr d24, [%[b_ptr0]]\n"
".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x10]\n"
".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
@@ -2235,27 +2503,23 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
"ins v24.d[1], temploadreg0\n"
".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
- "ins v25.d[1], temploadreg1\n"
".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
- "ldr d24, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
@@ -2366,19 +2630,117 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
+ ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
+ ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2445,6 +2807,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q4, [a_ptr1], #0x10\n"
"ldr q8, [a_ptr2], #0x10\n"
@@ -2461,24 +2824,41 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ldr q6, [a_ptr1], #0x10\n"
"ldr q10, [a_ptr2], #0x10\n"
"ldr q14, [a_ptr3], #0x10\n"
- "ldr q18, [a_ptr4], #0x10\n"
- "ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr d3, [%[a_ptr0]]\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr d7, [a_ptr1]\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr d11, [a_ptr2]\n"
"ldr d15, [a_ptr3]\n"
"ldr d19, [a_ptr4]\n"
"ldr d23, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
+ "ldr q2, [%[a_ptr0]], #0x10\n"
+ "ldr q6, [a_ptr1], #0x10\n"
+ "ldr q10, [a_ptr2], #0x10\n"
+ "ldr q14, [a_ptr3], #0x10\n"
"ldr s3, [%[a_ptr0]], #0x4\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr s7, [a_ptr1], #0x4\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr s11, [a_ptr2], #0x4\n"
"ldr s15, [a_ptr3], #0x4\n"
"ldr s19, [a_ptr4], #0x4\n"
"ldr s23, [a_ptr5], #0x4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v3.b}[4], [%[a_ptr0]]\n"
"ld1 {v7.b}[4], [a_ptr1]\n"
@@ -2505,24 +2885,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v19.b}[6], [a_ptr4]\n"
"ld1 {v23.b}[6], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2622,68 +3004,66 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr d24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v26.4s, #0\n"
- "ins v25.d[1], temploadreg1\n"
+ "ldr d24, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
+ "ldr d25, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v28.4s, #0\n"
+ "ins v24.d[1], temploadreg0\n"
+ "ins v25.d[1], temploadreg1\n"
"prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
"str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
"ldr d24, [%[b_ptr0]]\n"
".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x10]\n"
".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
"ldr d24, [%[b_ptr0]]\n"
".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
@@ -2775,27 +3155,23 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
"ins v25.d[1], temploadreg1\n"
".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
- "ldr d24, [%[b_ptr0]]\n"
".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
- "ins v24.d[1], temploadreg0\n"
".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v26.4s, #0\n"
- "ins v25.d[1], temploadreg1\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
@@ -2913,19 +3289,124 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+ ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
+ ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
+ ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
+ ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
+ ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
+ ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -3052,24 +3533,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v19.b}[10], [a_ptr4]\n"
"ld1 {v23.b}[10], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
@@ -3177,57 +3660,55 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr d24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
- "ins v25.d[1], temploadreg1\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v26.4s, #0\n"
+ "ldr d24, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+ "ldr d25, [%[b_ptr0], #0x10]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q28, [c_ptr2]\n"
- "movi v28.4s, #0\n"
"add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ins v24.d[1], temploadreg0\n"
+ "ins v25.d[1], temploadreg1\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q29, [c_ptr3]\n"
- "movi v29.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
"ldr d24, [%[b_ptr0]]\n"
".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x10]\n"
".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
@@ -3340,27 +3821,23 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
- "ins v25.d[1], temploadreg1\n"
".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
- "ldr d24, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
@@ -3486,19 +3963,132 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
+ ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
+ ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
+ ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
+ ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
+ ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
+ ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
+ ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
+ ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
+ ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -3566,6 +4156,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q4, [a_ptr1], #0x10\n"
"ldr q8, [a_ptr2], #0x10\n"
@@ -3584,7 +4175,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ldr q14, [a_ptr3], #0x10\n"
"ldr q18, [a_ptr4], #0x10\n"
"ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr q3, [%[a_ptr0]]\n"
"ldr q7, [a_ptr1]\n"
"ldr q11, [a_ptr2]\n"
@@ -3593,8 +4183,27 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ldr q23, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
+ "ldr q2, [%[a_ptr0]], #0x10\n"
+ "ldr q6, [a_ptr1], #0x10\n"
+ "ldr q10, [a_ptr2], #0x10\n"
+ "ldr q14, [a_ptr3], #0x10\n"
"ldr d3, [%[a_ptr0]], #0x8\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr d7, [a_ptr1], #0x8\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr d11, [a_ptr2], #0x8\n"
"ldr d15, [a_ptr3], #0x8\n"
"ldr d19, [a_ptr4], #0x8\n"
@@ -3605,7 +4214,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v15.s}[2], [a_ptr3], #4\n"
"ld1 {v19.s}[2], [a_ptr4], #4\n"
"ld1 {v23.s}[2], [a_ptr5], #4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v3.b}[12], [%[a_ptr0]]\n"
"ld1 {v7.b}[12], [a_ptr1]\n"
@@ -3632,24 +4240,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v19.b}[14], [a_ptr4]\n"
"ld1 {v23.b}[14], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
@@ -3764,68 +4374,66 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr d24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v24.d[1], temploadreg0\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v26.4s, #0\n"
- "ins v25.d[1], temploadreg1\n"
+ "ldr d24, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
+ "ldr d25, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q28, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v28.4s, #0\n"
+ "ins v24.d[1], temploadreg0\n"
+ "ins v25.d[1], temploadreg1\n"
"prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
"str q29, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
+ ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
"ldr d24, [%[b_ptr0]]\n"
".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
"ins v24.d[1], temploadreg0\n"
".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
"ldr d25, [%[b_ptr0], #0x10]\n"
".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
+ ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
"ldr d24, [%[b_ptr0]]\n"
".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
@@ -3936,27 +4544,23 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
"ins v25.d[1], temploadreg1\n"
".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
- "ldr d24, [%[b_ptr0]]\n"
".inst 0x6fa3eb3a // udot v26.4s, v25.16b, v3.4b[3]\n"
".inst 0x6fa7eb3b // udot v27.4s, v25.16b, v7.4b[3]\n"
".inst 0x6fabeb3c // udot v28.4s, v25.16b, v11.4b[3]\n"
- "ins v24.d[1], temploadreg0\n"
".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
- "ldr d25, [%[b_ptr0], #0x10]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v26.4s, #0\n"
- "ins v25.d[1], temploadreg1\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
@@ -4089,19 +4693,139 @@ void a64_smallK_hybrid_u8u32_dot_4x6_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
+ ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
+ ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
+ ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
+ ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
+ ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
+ ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
+ ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
+ ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
+ ".inst 0x6fa3eb3a // udot v26.4s, v25.16b, v3.4b[3]\n"
+ ".inst 0x6fa7eb3b // udot v27.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x6fabeb3c // udot v28.4s, v25.16b, v11.4b[3]\n"
+ ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
+ ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
+ ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp
index fe69f744e2..10bd16aa59 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x6/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_6x4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
+void a64_smallK_hybrid_u8u32_dot_6x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
const long loops_count = iceildiv(N, (int)4) - 1;
const long ldab = lda * sizeof(uint8_t);
const long ldcb = ldc * sizeof(uint32_t);
@@ -93,6 +93,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q3, [a_ptr1], #0x10\n"
"ldr q6, [a_ptr2], #0x10\n"
@@ -103,18 +104,29 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ldr q4, [a_ptr1], #0x10\n"
"ldr q7, [a_ptr2], #0x10\n"
"ldr q10, [a_ptr3], #0x10\n"
- "ldr q13, [a_ptr4], #0x10\n"
- "ldr q16, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr s2, [%[a_ptr0]]\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr s5, [a_ptr1]\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr s8, [a_ptr2]\n"
"ldr s11, [a_ptr3]\n"
"ldr s14, [a_ptr4]\n"
"ldr s17, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
"subs %[odds], %[odds], #0x1\n"
+ "ldr q3, [a_ptr1], #0x10\n"
+ "ldr q6, [a_ptr2], #0x10\n"
+ "ldr q9, [a_ptr3], #0x10\n"
+ "ldr q12, [a_ptr4], #0x10\n"
+ "ldr q15, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q7, [a_ptr2], #0x10\n"
+ "ldr q10, [a_ptr3], #0x10\n"
+ "ldr q13, [a_ptr4], #0x10\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"b.ne 4f\n"
"ldr b2, [%[a_ptr0]]\n"
"ldr b5, [a_ptr1]\n"
@@ -141,40 +153,42 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v14.b}[2], [a_ptr4]\n"
"ld1 {v17.b}[2], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
@@ -218,139 +232,201 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr q18, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"subs %[loops], %[loops], #0x1\n"
- "str q27, [c_ptr1]\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
+ ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
+ ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+ ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+ ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
+ ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+ ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
+ ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
+ ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
+ ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
+ ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
+ ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
+ ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
+ ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
+ ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
+ "b 9f\n"
+ "6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
- ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
@@ -397,19 +473,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -468,6 +539,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q3, [a_ptr1], #0x10\n"
"ldr q6, [a_ptr2], #0x10\n"
@@ -478,24 +550,35 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ldr q4, [a_ptr1], #0x10\n"
"ldr q7, [a_ptr2], #0x10\n"
"ldr q10, [a_ptr3], #0x10\n"
- "ldr q13, [a_ptr4], #0x10\n"
- "ldr q16, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr d2, [%[a_ptr0]]\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr d5, [a_ptr1]\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr d8, [a_ptr2]\n"
"ldr d11, [a_ptr3]\n"
"ldr d14, [a_ptr4]\n"
"ldr d17, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q3, [a_ptr1], #0x10\n"
+ "ldr q6, [a_ptr2], #0x10\n"
+ "ldr q9, [a_ptr3], #0x10\n"
+ "ldr q12, [a_ptr4], #0x10\n"
+ "ldr q15, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q7, [a_ptr2], #0x10\n"
+ "ldr q10, [a_ptr3], #0x10\n"
"ldr s2, [%[a_ptr0]], #0x4\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr s5, [a_ptr1], #0x4\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr s8, [a_ptr2], #0x4\n"
"ldr s11, [a_ptr3], #0x4\n"
"ldr s14, [a_ptr4], #0x4\n"
"ldr s17, [a_ptr5], #0x4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v2.b}[4], [%[a_ptr0]]\n"
"ld1 {v5.b}[4], [a_ptr1]\n"
@@ -522,38 +605,40 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v14.b}[6], [a_ptr4]\n"
"ld1 {v17.b}[6], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
@@ -606,144 +691,213 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr q18, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"subs %[loops], %[loops], #0x1\n"
- "str q27, [c_ptr1]\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
"add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
+ ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
+ ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+ ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+ ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
+ ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+ ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
+ ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
+ ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
+ ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
+ ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
+ ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
+ ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
+ ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
+ ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
+ ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
+ ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
+ ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
+ ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
+ "b 9f\n"
+ "6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
@@ -799,19 +953,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -924,38 +1073,40 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v14.b}[10], [a_ptr4]\n"
"ld1 {v17.b}[10], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
@@ -1015,62 +1166,60 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr q18, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"subs %[loops], %[loops], #0x1\n"
- "str q27, [c_ptr1]\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
@@ -1081,85 +1230,163 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
+ ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
+ ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+ ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+ ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
+ ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+ ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
+ ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
+ ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
+ ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
+ ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
+ ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
+ ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
+ ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
+ ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
+ ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
+ ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
+ ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
+ ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
+ ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
+ ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
+ ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
+ ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
+ ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
+ ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
+ ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
+ "b 9f\n"
+ "6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1222,19 +1449,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1293,6 +1515,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q3, [a_ptr1], #0x10\n"
"ldr q6, [a_ptr2], #0x10\n"
@@ -1305,7 +1528,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ldr q10, [a_ptr3], #0x10\n"
"ldr q13, [a_ptr4], #0x10\n"
"ldr q16, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr q2, [%[a_ptr0]]\n"
"ldr q5, [a_ptr1]\n"
"ldr q8, [a_ptr2]\n"
@@ -1314,8 +1536,21 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ldr q17, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q3, [a_ptr1], #0x10\n"
+ "ldr q6, [a_ptr2], #0x10\n"
+ "ldr q9, [a_ptr3], #0x10\n"
+ "ldr q12, [a_ptr4], #0x10\n"
+ "ldr q15, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q7, [a_ptr2], #0x10\n"
+ "ldr q10, [a_ptr3], #0x10\n"
"ldr d2, [%[a_ptr0]], #0x8\n"
+ "ldr q13, [a_ptr4], #0x10\n"
"ldr d5, [a_ptr1], #0x8\n"
+ "ldr q16, [a_ptr5], #0x10\n"
"ldr d8, [a_ptr2], #0x8\n"
"ldr d11, [a_ptr3], #0x8\n"
"ldr d14, [a_ptr4], #0x8\n"
@@ -1326,7 +1561,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v11.s}[2], [a_ptr3], #4\n"
"ld1 {v14.s}[2], [a_ptr4], #4\n"
"ld1 {v17.s}[2], [a_ptr5], #4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v2.b}[12], [%[a_ptr0]]\n"
"ld1 {v5.b}[12], [a_ptr1]\n"
@@ -1353,38 +1587,40 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v14.b}[14], [a_ptr4]\n"
"ld1 {v17.b}[14], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q18, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"ldr q21, [%[b_ptr0], #0x30]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
"ldr q22, [%[b_ptr0], #0x40]\n"
- "movi v31.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
"ldr q23, [%[b_ptr0], #0x50]\n"
- ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
"ldr q24, [%[b_ptr0], #0x60]\n"
- ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"ldr q25, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
@@ -1451,62 +1687,60 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr q18, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"subs %[loops], %[loops], #0x1\n"
- "str q27, [c_ptr1]\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
"ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
"ldr q20, [%[b_ptr0], #0x20]\n"
@@ -1524,85 +1758,170 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
- "ldr q22, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
- "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
- "ldr q24, [%[b_ptr0], #0x60]\n"
".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
- "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
- "ldr q18, [%[b_ptr0]]\n"
".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
- "ldr q19, [%[b_ptr0], #0x10]\n"
".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
- "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6fa2eaba // udot v26.4s, v21.16b, v2.4b[3]\n"
".inst 0x6fa5eabb // udot v27.4s, v21.16b, v5.4b[3]\n"
".inst 0x6fa8eabc // udot v28.4s, v21.16b, v8.4b[3]\n"
".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
- "ldr q21, [%[b_ptr0], #0x30]\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "movi v26.4s, #0\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ "ldr q22, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x50]\n"
".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ "ldr q24, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q25, [%[b_ptr0], #0x70]\n"
".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
+ "ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
+ ".inst 0x6face27e // udot v30.4s, v19.16b, v12.4b[1]\n"
+ ".inst 0x6fafe27f // udot v31.4s, v19.16b, v15.4b[1]\n"
+ "ldr q19, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f80ea9a // udot v26.4s, v20.16b, v0.4b[2]\n"
+ ".inst 0x6f83ea9b // udot v27.4s, v20.16b, v3.4b[2]\n"
+ ".inst 0x6f86ea9c // udot v28.4s, v20.16b, v6.4b[2]\n"
+ ".inst 0x6f89ea9d // udot v29.4s, v20.16b, v9.4b[2]\n"
+ ".inst 0x6f8cea9e // udot v30.4s, v20.16b, v12.4b[2]\n"
+ ".inst 0x6f8fea9f // udot v31.4s, v20.16b, v15.4b[2]\n"
+ "ldr q20, [%[b_ptr0], #0x20]\n"
+ ".inst 0x6fa0eaba // udot v26.4s, v21.16b, v0.4b[3]\n"
+ ".inst 0x6fa3eabb // udot v27.4s, v21.16b, v3.4b[3]\n"
+ ".inst 0x6fa6eabc // udot v28.4s, v21.16b, v6.4b[3]\n"
+ ".inst 0x6fa9eabd // udot v29.4s, v21.16b, v9.4b[3]\n"
+ ".inst 0x6faceabe // udot v30.4s, v21.16b, v12.4b[3]\n"
+ ".inst 0x6fafeabf // udot v31.4s, v21.16b, v15.4b[3]\n"
+ "ldr q21, [%[b_ptr0], #0x30]\n"
+ ".inst 0x6f81e2da // udot v26.4s, v22.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ ".inst 0x6f84e2db // udot v27.4s, v22.16b, v4.4b[0]\n"
+ ".inst 0x6f87e2dc // udot v28.4s, v22.16b, v7.4b[0]\n"
+ ".inst 0x6f8ae2dd // udot v29.4s, v22.16b, v10.4b[0]\n"
+ ".inst 0x6f8de2de // udot v30.4s, v22.16b, v13.4b[0]\n"
+ ".inst 0x6f90e2df // udot v31.4s, v22.16b, v16.4b[0]\n"
+ ".inst 0x6fa1e2fa // udot v26.4s, v23.16b, v1.4b[1]\n"
+ ".inst 0x6fa4e2fb // udot v27.4s, v23.16b, v4.4b[1]\n"
+ ".inst 0x6fa7e2fc // udot v28.4s, v23.16b, v7.4b[1]\n"
+ ".inst 0x6faae2fd // udot v29.4s, v23.16b, v10.4b[1]\n"
+ ".inst 0x6fade2fe // udot v30.4s, v23.16b, v13.4b[1]\n"
+ ".inst 0x6fb0e2ff // udot v31.4s, v23.16b, v16.4b[1]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f87eb1c // udot v28.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x6f8aeb1d // udot v29.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8deb1e // udot v30.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f90eb1f // udot v31.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa7eb3c // udot v28.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x6faaeb3d // udot v29.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6fadeb3e // udot v30.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb0eb3f // udot v31.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6f82e25a // udot v26.4s, v18.16b, v2.4b[0]\n"
+ ".inst 0x6f85e25b // udot v27.4s, v18.16b, v5.4b[0]\n"
+ ".inst 0x6f88e25c // udot v28.4s, v18.16b, v8.4b[0]\n"
+ ".inst 0x6f8be25d // udot v29.4s, v18.16b, v11.4b[0]\n"
+ ".inst 0x6f8ee25e // udot v30.4s, v18.16b, v14.4b[0]\n"
+ ".inst 0x6f91e25f // udot v31.4s, v18.16b, v17.4b[0]\n"
+ ".inst 0x6fa2e27a // udot v26.4s, v19.16b, v2.4b[1]\n"
+ ".inst 0x6fa5e27b // udot v27.4s, v19.16b, v5.4b[1]\n"
+ ".inst 0x6fa8e27c // udot v28.4s, v19.16b, v8.4b[1]\n"
+ ".inst 0x6fabe27d // udot v29.4s, v19.16b, v11.4b[1]\n"
+ ".inst 0x6faee27e // udot v30.4s, v19.16b, v14.4b[1]\n"
+ ".inst 0x6fb1e27f // udot v31.4s, v19.16b, v17.4b[1]\n"
+ ".inst 0x6f82ea9a // udot v26.4s, v20.16b, v2.4b[2]\n"
+ ".inst 0x6f85ea9b // udot v27.4s, v20.16b, v5.4b[2]\n"
+ ".inst 0x6f88ea9c // udot v28.4s, v20.16b, v8.4b[2]\n"
+ ".inst 0x6f8bea9d // udot v29.4s, v20.16b, v11.4b[2]\n"
+ ".inst 0x6f8eea9e // udot v30.4s, v20.16b, v14.4b[2]\n"
+ ".inst 0x6f91ea9f // udot v31.4s, v20.16b, v17.4b[2]\n"
+ ".inst 0x6fa2eaba // udot v26.4s, v21.16b, v2.4b[3]\n"
+ ".inst 0x6fa5eabb // udot v27.4s, v21.16b, v5.4b[3]\n"
+ ".inst 0x6fa8eabc // udot v28.4s, v21.16b, v8.4b[3]\n"
+ ".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
+ ".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
+ ".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
+ "b 9f\n"
+ "6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e25a // udot v26.4s, v18.16b, v0.4b[0]\n"
+ ".inst 0x6f83e25b // udot v27.4s, v18.16b, v3.4b[0]\n"
+ ".inst 0x6f86e25c // udot v28.4s, v18.16b, v6.4b[0]\n"
+ ".inst 0x6f89e25d // udot v29.4s, v18.16b, v9.4b[0]\n"
".inst 0x6f8ce25e // udot v30.4s, v18.16b, v12.4b[0]\n"
- ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6f8fe25f // udot v31.4s, v18.16b, v15.4b[0]\n"
"ldr q18, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e27a // udot v26.4s, v19.16b, v0.4b[1]\n"
".inst 0x6fa3e27b // udot v27.4s, v19.16b, v3.4b[1]\n"
".inst 0x6fa6e27c // udot v28.4s, v19.16b, v6.4b[1]\n"
".inst 0x6fa9e27d // udot v29.4s, v19.16b, v9.4b[1]\n"
@@ -1672,19 +1991,14 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fabeabd // udot v29.4s, v21.16b, v11.4b[3]\n"
".inst 0x6faeeabe // udot v30.4s, v21.16b, v14.4b[3]\n"
".inst 0x6fb1eabf // udot v31.4s, v21.16b, v17.4b[3]\n"
- "6:\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1743,6 +2057,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q4, [a_ptr1], #0x10\n"
"ldr q8, [a_ptr2], #0x10\n"
@@ -1759,18 +2074,35 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ldr q6, [a_ptr1], #0x10\n"
"ldr q10, [a_ptr2], #0x10\n"
"ldr q14, [a_ptr3], #0x10\n"
- "ldr q18, [a_ptr4], #0x10\n"
- "ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr s3, [%[a_ptr0]]\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr s7, [a_ptr1]\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr s11, [a_ptr2]\n"
"ldr s15, [a_ptr3]\n"
"ldr s19, [a_ptr4]\n"
"ldr s23, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
"subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
+ "ldr q2, [%[a_ptr0]], #0x10\n"
+ "ldr q6, [a_ptr1], #0x10\n"
+ "ldr q10, [a_ptr2], #0x10\n"
+ "ldr q14, [a_ptr3], #0x10\n"
+ "ldr q18, [a_ptr4], #0x10\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"b.ne 4f\n"
"ldr b3, [%[a_ptr0]]\n"
"ldr b7, [a_ptr1]\n"
@@ -1797,24 +2129,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v19.b}[2], [a_ptr4]\n"
"ld1 {v23.b}[2], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
@@ -1907,38 +2241,36 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr q24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q25, [%[b_ptr0], #0x10]\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "movi v26.4s, #0\n"
"subs %[loops], %[loops], #0x1\n"
- "str q27, [c_ptr1]\n"
+ "movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
"ldr q24, [%[b_ptr0]]\n"
".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
@@ -2028,20 +2360,20 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
- "ldr q25, [%[b_ptr0], #0x10]\n"
".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
- "ldr q24, [%[b_ptr0]]\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "movi v26.4s, #0\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
@@ -2152,19 +2484,117 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
+ ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
+ ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2223,6 +2653,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q4, [a_ptr1], #0x10\n"
"ldr q8, [a_ptr2], #0x10\n"
@@ -2239,24 +2670,41 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ldr q6, [a_ptr1], #0x10\n"
"ldr q10, [a_ptr2], #0x10\n"
"ldr q14, [a_ptr3], #0x10\n"
- "ldr q18, [a_ptr4], #0x10\n"
- "ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr d3, [%[a_ptr0]]\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr d7, [a_ptr1]\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr d11, [a_ptr2]\n"
"ldr d15, [a_ptr3]\n"
"ldr d19, [a_ptr4]\n"
"ldr d23, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
+ "ldr q2, [%[a_ptr0]], #0x10\n"
+ "ldr q6, [a_ptr1], #0x10\n"
+ "ldr q10, [a_ptr2], #0x10\n"
+ "ldr q14, [a_ptr3], #0x10\n"
"ldr s3, [%[a_ptr0]], #0x4\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr s7, [a_ptr1], #0x4\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr s11, [a_ptr2], #0x4\n"
"ldr s15, [a_ptr3], #0x4\n"
"ldr s19, [a_ptr4], #0x4\n"
"ldr s23, [a_ptr5], #0x4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v3.b}[4], [%[a_ptr0]]\n"
"ld1 {v7.b}[4], [a_ptr1]\n"
@@ -2283,24 +2731,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v19.b}[6], [a_ptr4]\n"
"ld1 {v23.b}[6], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2400,38 +2850,36 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr q24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "str q27, [c_ptr1]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
"ldr q24, [%[b_ptr0]]\n"
".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
@@ -2528,7 +2976,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
- "ldr q24, [%[b_ptr0]]\n"
".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
@@ -2540,6 +2987,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
"add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
@@ -2659,19 +3107,124 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+ ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
+ ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
+ ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
+ ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
+ ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
+ ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2790,24 +3343,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v19.b}[10], [a_ptr4]\n"
"ld1 {v23.b}[10], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
@@ -2915,38 +3470,36 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr q24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q25, [%[b_ptr0], #0x10]\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "movi v26.4s, #0\n"
"subs %[loops], %[loops], #0x1\n"
- "str q27, [c_ptr1]\n"
+ "movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
"ldr q24, [%[b_ptr0]]\n"
".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
@@ -3051,20 +3604,20 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
- "ldr q25, [%[b_ptr0], #0x10]\n"
".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
- "ldr q24, [%[b_ptr0]]\n"
"b.ne 8b\n"
"7:\n"
"str q26, [%[c_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "movi v26.4s, #0\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
+ "movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v27.4s, #0\n"
@@ -3190,19 +3743,132 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
+ ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
+ ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
+ ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
+ ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
+ ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
+ ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
+ ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
+ ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
+ ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -3262,6 +3928,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q4, [a_ptr1], #0x10\n"
"ldr q8, [a_ptr2], #0x10\n"
@@ -3280,7 +3947,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ldr q14, [a_ptr3], #0x10\n"
"ldr q18, [a_ptr4], #0x10\n"
"ldr q22, [a_ptr5], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr q3, [%[a_ptr0]]\n"
"ldr q7, [a_ptr1]\n"
"ldr q11, [a_ptr2]\n"
@@ -3289,8 +3955,27 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ldr q23, [a_ptr5]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q4, [a_ptr1], #0x10\n"
+ "ldr q8, [a_ptr2], #0x10\n"
+ "ldr q12, [a_ptr3], #0x10\n"
+ "ldr q16, [a_ptr4], #0x10\n"
+ "ldr q20, [a_ptr5], #0x10\n"
+ "ldr q1, [%[a_ptr0]], #0x10\n"
+ "ldr q5, [a_ptr1], #0x10\n"
+ "ldr q9, [a_ptr2], #0x10\n"
+ "ldr q13, [a_ptr3], #0x10\n"
+ "ldr q17, [a_ptr4], #0x10\n"
+ "ldr q21, [a_ptr5], #0x10\n"
+ "ldr q2, [%[a_ptr0]], #0x10\n"
+ "ldr q6, [a_ptr1], #0x10\n"
+ "ldr q10, [a_ptr2], #0x10\n"
+ "ldr q14, [a_ptr3], #0x10\n"
"ldr d3, [%[a_ptr0]], #0x8\n"
+ "ldr q18, [a_ptr4], #0x10\n"
"ldr d7, [a_ptr1], #0x8\n"
+ "ldr q22, [a_ptr5], #0x10\n"
"ldr d11, [a_ptr2], #0x8\n"
"ldr d15, [a_ptr3], #0x8\n"
"ldr d19, [a_ptr4], #0x8\n"
@@ -3301,7 +3986,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v15.s}[2], [a_ptr3], #4\n"
"ld1 {v19.s}[2], [a_ptr4], #4\n"
"ld1 {v23.s}[2], [a_ptr5], #4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v3.b}[12], [%[a_ptr0]]\n"
"ld1 {v7.b}[12], [a_ptr1]\n"
@@ -3328,24 +4012,26 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v19.b}[14], [a_ptr4]\n"
"ld1 {v23.b}[14], [a_ptr5]\n"
"3:\n"
- "movi v26.4s, #0\n"
"ldr q24, [%[b_ptr0]]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v26.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x100]\n"
".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x140]\n"
".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr5, #0x180]\n"
".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
@@ -3460,38 +4146,36 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr q24, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
"b.eq 7f\n"
"8:\n"
"str q26, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "str q27, [c_ptr1]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v27.4s, #0\n"
+ "str q27, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v27.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
"str q28, [c_ptr2]\n"
"movi v28.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
"str q29, [c_ptr3]\n"
"movi v29.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
"str q30, [c_ptr4]\n"
"movi v30.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
"str q31, [c_ptr5]\n"
"movi v31.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
"add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
"ldr q24, [%[b_ptr0]]\n"
".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
@@ -3603,7 +4287,6 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
- "ldr q24, [%[b_ptr0]]\n"
".inst 0x6fa3eb3a // udot v26.4s, v25.16b, v3.4b[3]\n"
".inst 0x6fa7eb3b // udot v27.4s, v25.16b, v7.4b[3]\n"
".inst 0x6fabeb3c // udot v28.4s, v25.16b, v11.4b[3]\n"
@@ -3615,6 +4298,7 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v26.4s, #0\n"
+ "ldr q24, [%[b_ptr0]]\n"
"ldr q25, [%[b_ptr0], #0x10]\n"
"add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q27, [c_ptr1]\n"
@@ -3749,19 +4433,139 @@ void a64_smallK_hybrid_u8u32_dot_4x6(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
+ "b 9f\n"
"6:\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e31a // udot v26.4s, v24.16b, v0.4b[0]\n"
+ ".inst 0x6f84e31b // udot v27.4s, v24.16b, v4.4b[0]\n"
+ ".inst 0x6f88e31c // udot v28.4s, v24.16b, v8.4b[0]\n"
+ ".inst 0x6f8ce31d // udot v29.4s, v24.16b, v12.4b[0]\n"
+ ".inst 0x6f90e31e // udot v30.4s, v24.16b, v16.4b[0]\n"
+ ".inst 0x6f94e31f // udot v31.4s, v24.16b, v20.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0e33a // udot v26.4s, v25.16b, v0.4b[1]\n"
+ ".inst 0x6fa4e33b // udot v27.4s, v25.16b, v4.4b[1]\n"
+ ".inst 0x6fa8e33c // udot v28.4s, v25.16b, v8.4b[1]\n"
+ ".inst 0x6face33d // udot v29.4s, v25.16b, v12.4b[1]\n"
+ ".inst 0x6fb0e33e // udot v30.4s, v25.16b, v16.4b[1]\n"
+ ".inst 0x6fb4e33f // udot v31.4s, v25.16b, v20.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f80eb1a // udot v26.4s, v24.16b, v0.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f84eb1b // udot v27.4s, v24.16b, v4.4b[2]\n"
+ ".inst 0x6f88eb1c // udot v28.4s, v24.16b, v8.4b[2]\n"
+ ".inst 0x6f8ceb1d // udot v29.4s, v24.16b, v12.4b[2]\n"
+ ".inst 0x6f90eb1e // udot v30.4s, v24.16b, v16.4b[2]\n"
+ ".inst 0x6f94eb1f // udot v31.4s, v24.16b, v20.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa0eb3a // udot v26.4s, v25.16b, v0.4b[3]\n"
+ ".inst 0x6fa4eb3b // udot v27.4s, v25.16b, v4.4b[3]\n"
+ ".inst 0x6fa8eb3c // udot v28.4s, v25.16b, v8.4b[3]\n"
+ ".inst 0x6faceb3d // udot v29.4s, v25.16b, v12.4b[3]\n"
+ ".inst 0x6fb0eb3e // udot v30.4s, v25.16b, v16.4b[3]\n"
+ ".inst 0x6fb4eb3f // udot v31.4s, v25.16b, v20.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81e31a // udot v26.4s, v24.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85e31b // udot v27.4s, v24.16b, v5.4b[0]\n"
+ ".inst 0x6f89e31c // udot v28.4s, v24.16b, v9.4b[0]\n"
+ ".inst 0x6f8de31d // udot v29.4s, v24.16b, v13.4b[0]\n"
+ ".inst 0x6f91e31e // udot v30.4s, v24.16b, v17.4b[0]\n"
+ ".inst 0x6f95e31f // udot v31.4s, v24.16b, v21.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1e33a // udot v26.4s, v25.16b, v1.4b[1]\n"
+ ".inst 0x6fa5e33b // udot v27.4s, v25.16b, v5.4b[1]\n"
+ ".inst 0x6fa9e33c // udot v28.4s, v25.16b, v9.4b[1]\n"
+ ".inst 0x6fade33d // udot v29.4s, v25.16b, v13.4b[1]\n"
+ ".inst 0x6fb1e33e // udot v30.4s, v25.16b, v17.4b[1]\n"
+ ".inst 0x6fb5e33f // udot v31.4s, v25.16b, v21.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f81eb1a // udot v26.4s, v24.16b, v1.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f85eb1b // udot v27.4s, v24.16b, v5.4b[2]\n"
+ ".inst 0x6f89eb1c // udot v28.4s, v24.16b, v9.4b[2]\n"
+ ".inst 0x6f8deb1d // udot v29.4s, v24.16b, v13.4b[2]\n"
+ ".inst 0x6f91eb1e // udot v30.4s, v24.16b, v17.4b[2]\n"
+ ".inst 0x6f95eb1f // udot v31.4s, v24.16b, v21.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa1eb3a // udot v26.4s, v25.16b, v1.4b[3]\n"
+ ".inst 0x6fa5eb3b // udot v27.4s, v25.16b, v5.4b[3]\n"
+ ".inst 0x6fa9eb3c // udot v28.4s, v25.16b, v9.4b[3]\n"
+ ".inst 0x6fadeb3d // udot v29.4s, v25.16b, v13.4b[3]\n"
+ ".inst 0x6fb1eb3e // udot v30.4s, v25.16b, v17.4b[3]\n"
+ ".inst 0x6fb5eb3f // udot v31.4s, v25.16b, v21.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82e31a // udot v26.4s, v24.16b, v2.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86e31b // udot v27.4s, v24.16b, v6.4b[0]\n"
+ ".inst 0x6f8ae31c // udot v28.4s, v24.16b, v10.4b[0]\n"
+ ".inst 0x6f8ee31d // udot v29.4s, v24.16b, v14.4b[0]\n"
+ ".inst 0x6f92e31e // udot v30.4s, v24.16b, v18.4b[0]\n"
+ ".inst 0x6f96e31f // udot v31.4s, v24.16b, v22.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2e33a // udot v26.4s, v25.16b, v2.4b[1]\n"
+ ".inst 0x6fa6e33b // udot v27.4s, v25.16b, v6.4b[1]\n"
+ ".inst 0x6faae33c // udot v28.4s, v25.16b, v10.4b[1]\n"
+ ".inst 0x6faee33d // udot v29.4s, v25.16b, v14.4b[1]\n"
+ ".inst 0x6fb2e33e // udot v30.4s, v25.16b, v18.4b[1]\n"
+ ".inst 0x6fb6e33f // udot v31.4s, v25.16b, v22.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f82eb1a // udot v26.4s, v24.16b, v2.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f86eb1b // udot v27.4s, v24.16b, v6.4b[2]\n"
+ ".inst 0x6f8aeb1c // udot v28.4s, v24.16b, v10.4b[2]\n"
+ ".inst 0x6f8eeb1d // udot v29.4s, v24.16b, v14.4b[2]\n"
+ ".inst 0x6f92eb1e // udot v30.4s, v24.16b, v18.4b[2]\n"
+ ".inst 0x6f96eb1f // udot v31.4s, v24.16b, v22.4b[2]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa2eb3a // udot v26.4s, v25.16b, v2.4b[3]\n"
+ ".inst 0x6fa6eb3b // udot v27.4s, v25.16b, v6.4b[3]\n"
+ ".inst 0x6faaeb3c // udot v28.4s, v25.16b, v10.4b[3]\n"
+ ".inst 0x6faeeb3d // udot v29.4s, v25.16b, v14.4b[3]\n"
+ ".inst 0x6fb2eb3e // udot v30.4s, v25.16b, v18.4b[3]\n"
+ ".inst 0x6fb6eb3f // udot v31.4s, v25.16b, v22.4b[3]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f83e31a // udot v26.4s, v24.16b, v3.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f87e31b // udot v27.4s, v24.16b, v7.4b[0]\n"
+ ".inst 0x6f8be31c // udot v28.4s, v24.16b, v11.4b[0]\n"
+ ".inst 0x6f8fe31d // udot v29.4s, v24.16b, v15.4b[0]\n"
+ ".inst 0x6f93e31e // udot v30.4s, v24.16b, v19.4b[0]\n"
+ ".inst 0x6f97e31f // udot v31.4s, v24.16b, v23.4b[0]\n"
+ "ldr q24, [%[b_ptr0]]\n"
+ ".inst 0x6fa3e33a // udot v26.4s, v25.16b, v3.4b[1]\n"
+ ".inst 0x6fa7e33b // udot v27.4s, v25.16b, v7.4b[1]\n"
+ ".inst 0x6fabe33c // udot v28.4s, v25.16b, v11.4b[1]\n"
+ ".inst 0x6fafe33d // udot v29.4s, v25.16b, v15.4b[1]\n"
+ ".inst 0x6fb3e33e // udot v30.4s, v25.16b, v19.4b[1]\n"
+ ".inst 0x6fb7e33f // udot v31.4s, v25.16b, v23.4b[1]\n"
+ "ldr q25, [%[b_ptr0], #0x10]\n"
+ ".inst 0x6f83eb1a // udot v26.4s, v24.16b, v3.4b[2]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ ".inst 0x6f87eb1b // udot v27.4s, v24.16b, v7.4b[2]\n"
+ ".inst 0x6f8beb1c // udot v28.4s, v24.16b, v11.4b[2]\n"
+ ".inst 0x6f8feb1d // udot v29.4s, v24.16b, v15.4b[2]\n"
+ ".inst 0x6f93eb1e // udot v30.4s, v24.16b, v19.4b[2]\n"
+ ".inst 0x6f97eb1f // udot v31.4s, v24.16b, v23.4b[2]\n"
+ ".inst 0x6fa3eb3a // udot v26.4s, v25.16b, v3.4b[3]\n"
+ ".inst 0x6fa7eb3b // udot v27.4s, v25.16b, v7.4b[3]\n"
+ ".inst 0x6fabeb3c // udot v28.4s, v25.16b, v11.4b[3]\n"
+ ".inst 0x6fafeb3d // udot v29.4s, v25.16b, v15.4b[3]\n"
+ ".inst 0x6fb3eb3e // udot v30.4s, v25.16b, v19.4b[3]\n"
+ ".inst 0x6fb7eb3f // udot v31.4s, v25.16b, v23.4b[3]\n"
+ "9:\n"
"str q26, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q27, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q28, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q29, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q30, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q31, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp
index d91416c3be..942f94b0bf 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4.hpp
@@ -31,10 +31,10 @@ namespace arm_gemm
{
// Actual kernel implementations
-void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
-void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+void a64_smallK_hybrid_u8u32_dot_8x4(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+void a64_smallK_hybrid_u8u32_dot_8x4_a55(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
-class smallK_hybrid_u8u32_dot_4x8
+class cls_a64_smallK_hybrid_u8u32_dot_8x4
{
public:
typedef uint8_t operand_type;
@@ -76,12 +76,12 @@ public:
StdTransformsFixed<operand_type, result_type, 8, 4, 4> transforms = {};
// Default to the generic kernel
- kern_type kernel=a64_smallK_hybrid_u8u32_dot_4x8;
+ kern_type kernel=a64_smallK_hybrid_u8u32_dot_8x4;
- smallK_hybrid_u8u32_dot_4x8(const CPUInfo *ci)
+ cls_a64_smallK_hybrid_u8u32_dot_8x4(const CPUInfo *ci)
{
if (ci->get_cpu_model() == CPUModel::A55r1) {
- kernel = a64_smallK_hybrid_u8u32_dot_4x8_a55;
+ kernel = a64_smallK_hybrid_u8u32_dot_8x4_a55;
}
}
};
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/a55.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp
index e70fb6955e..fcb546f51e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/a55.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/a55.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
+void a64_smallK_hybrid_u8u32_dot_8x4_a55(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
const long loops_count = iceildiv(N, (int)4) - 1;
const long ldab = lda * sizeof(uint8_t);
const long ldcb = ldc * sizeof(uint32_t);
@@ -157,22 +157,24 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v6.b}[2], [a_ptr6]\n"
"ld1 {v7.b}[2], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "movi v26.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "movi v27.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "movi v28.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "movi v29.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "movi v30.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "movi v31.4s, #0\n"
"add %[b_ptr0], %[b_ptr0], #0x10\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
@@ -181,55 +183,49 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
- "ins v16.d[1], temploadreg0\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ "ins v16.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
"str q26, [c_ptr2]\n"
- "movi v26.4s, #0\n"
"add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+ "movi v26.4s, #0\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
"add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
- ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
"prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
"prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "ins v16.d[1], temploadreg0\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
@@ -239,6 +235,8 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
@@ -268,23 +266,34 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -423,24 +432,26 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v6.b}[6], [a_ptr6]\n"
"ld1 {v7.b}[6], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
"movi v26.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v27.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
@@ -456,78 +467,72 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "ins v16.d[1], temploadreg0\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v17.d[1], temploadreg1\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ins v16.d[1], temploadreg0\n"
+ "ins v17.d[1], temploadreg1\n"
"prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
- ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
"str q27, [c_ptr3]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
+ ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
+ ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
+ ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v17.d[1], temploadreg1\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
@@ -565,23 +570,42 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -720,26 +744,28 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v6.b}[10], [a_ptr6]\n"
"ld1 {v7.b}[10], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
"movi v27.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x30\n"
".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
@@ -762,95 +788,86 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "add %[b_ptr0], %[b_ptr0], #0x30\n"
- "ins v16.d[1], temploadreg0\n"
- "ins v17.d[1], temploadreg1\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v18.d[1], temploadreg2\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0], #0x20]\n"
"ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
"str q27, [c_ptr3]\n"
- "movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+ "movi v27.4s, #0\n"
+ "ins v16.d[1], temploadreg0\n"
+ "ins v17.d[1], temploadreg1\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"str q28, [c_ptr4]\n"
- "movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
"add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
- "ins v17.d[1], temploadreg1\n"
".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x30\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v18.d[1], temploadreg2\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
@@ -876,8 +893,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
@@ -893,23 +911,50 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
+ ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
+ ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
+ ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1056,28 +1101,30 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v6.b}[14], [a_ptr6]\n"
"ld1 {v7.b}[14], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
@@ -1107,112 +1154,101 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n"
".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n"
".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "ins v16.d[1], temploadreg0\n"
- "ins v17.d[1], temploadreg1\n"
- "ins v18.d[1], temploadreg2\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v19.d[1], temploadreg3\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0], #0x20]\n"
"ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"str q27, [c_ptr3]\n"
- "movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+ "movi v27.4s, #0\n"
+ "ldr d19, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
"str q28, [c_ptr4]\n"
- "movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ins v16.d[1], temploadreg0\n"
+ "ins v17.d[1], temploadreg1\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"str q29, [c_ptr5]\n"
- "movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
+ "ins v19.d[1], temploadreg3\n"
+ ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
"add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
- "ins v17.d[1], temploadreg1\n"
".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x6fa1ea79 // udot v25.4s, v19.16b, v1.4b[3]\n"
".inst 0x6fa2ea7a // udot v26.4s, v19.16b, v2.4b[3]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x6fa3ea7b // udot v27.4s, v19.16b, v3.4b[3]\n"
".inst 0x6fa4ea7c // udot v28.4s, v19.16b, v4.4b[3]\n"
".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n"
".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n"
".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v19.d[1], temploadreg3\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
"str q27, [c_ptr3]\n"
@@ -1235,8 +1271,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
@@ -1260,23 +1297,58 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n"
".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n"
".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
+ ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
+ ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
+ ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
+ ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea79 // udot v25.4s, v19.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ea7a // udot v26.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x6fa3ea7b // udot v27.4s, v19.16b, v3.4b[3]\n"
+ ".inst 0x6fa4ea7c // udot v28.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n"
+ ".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1363,26 +1435,34 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q2, [a_ptr1], #0x10\n"
"ldr q4, [a_ptr2], #0x10\n"
"ldr q6, [a_ptr3], #0x10\n"
- "ldr q8, [a_ptr4], #0x10\n"
- "ldr q10, [a_ptr5], #0x10\n"
- "ldr q12, [a_ptr6], #0x10\n"
- "ldr q14, [a_ptr7], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr s1, [%[a_ptr0]]\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr s3, [a_ptr1]\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr s5, [a_ptr2]\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr s7, [a_ptr3]\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr s9, [a_ptr4]\n"
"ldr s11, [a_ptr5]\n"
"ldr s13, [a_ptr6]\n"
"ldr s15, [a_ptr7]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
"subs %[odds], %[odds], #0x1\n"
+ "ldr q2, [a_ptr1], #0x10\n"
+ "ldr q4, [a_ptr2], #0x10\n"
+ "ldr q6, [a_ptr3], #0x10\n"
+ "ldr q8, [a_ptr4], #0x10\n"
+ "ldr q10, [a_ptr5], #0x10\n"
+ "ldr q12, [a_ptr6], #0x10\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"b.ne 4f\n"
"ldr b1, [%[a_ptr0]]\n"
"ldr b3, [a_ptr1]\n"
@@ -1415,30 +1495,32 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v13.b}[2], [a_ptr6]\n"
"ld1 {v15.b}[2], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x50\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x50\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
@@ -1475,126 +1557,113 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
- "ins v16.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "add %[b_ptr0], %[b_ptr0], #0x50\n"
- "ins v17.d[1], temploadreg1\n"
- "ins v18.d[1], temploadreg2\n"
- "ins v19.d[1], temploadreg3\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v20.d[1], temploadreg0\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0], #0x20]\n"
"ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"str q27, [c_ptr3]\n"
- "movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "movi v27.4s, #0\n"
+ "ldr d19, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"str q28, [c_ptr4]\n"
- "movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ldr d20, [%[b_ptr0], #0x40]\n"
+ "ins v16.d[1], temploadreg0\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"str q29, [c_ptr5]\n"
- "movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ "ins v17.d[1], temploadreg1\n"
+ ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
+ "ins v19.d[1], temploadreg3\n"
+ ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ "ins v20.d[1], temploadreg0\n"
+ ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
+ ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x50\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
- "ins v17.d[1], temploadreg1\n"
".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
- "add %[b_ptr0], %[b_ptr0], #0x50\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v20.d[1], temploadreg0\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
@@ -1617,8 +1686,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x50\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
@@ -1650,23 +1720,66 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
+ ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1753,34 +1866,42 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q2, [a_ptr1], #0x10\n"
"ldr q4, [a_ptr2], #0x10\n"
"ldr q6, [a_ptr3], #0x10\n"
- "ldr q8, [a_ptr4], #0x10\n"
- "ldr q10, [a_ptr5], #0x10\n"
- "ldr q12, [a_ptr6], #0x10\n"
- "ldr q14, [a_ptr7], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr d1, [%[a_ptr0]]\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr d3, [a_ptr1]\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr d5, [a_ptr2]\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr d7, [a_ptr3]\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr d9, [a_ptr4]\n"
"ldr d11, [a_ptr5]\n"
"ldr d13, [a_ptr6]\n"
"ldr d15, [a_ptr7]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q2, [a_ptr1], #0x10\n"
+ "ldr q4, [a_ptr2], #0x10\n"
"ldr s1, [%[a_ptr0]], #0x4\n"
+ "ldr q6, [a_ptr3], #0x10\n"
"ldr s3, [a_ptr1], #0x4\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr s5, [a_ptr2], #0x4\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr s7, [a_ptr3], #0x4\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr s9, [a_ptr4], #0x4\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr s11, [a_ptr5], #0x4\n"
"ldr s13, [a_ptr6], #0x4\n"
"ldr s15, [a_ptr7], #0x4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v1.b}[4], [%[a_ptr0]]\n"
"ld1 {v3.b}[4], [a_ptr1]\n"
@@ -1813,32 +1934,34 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v13.b}[6], [a_ptr6]\n"
"ld1 {v15.b}[6], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ldr q21, [%[b_ptr0], #0x50]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x60\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x60\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
@@ -1882,146 +2005,132 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
- "ins v16.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d21, [%[b_ptr0], #0x50]\n"
- "ins v17.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "add %[b_ptr0], %[b_ptr0], #0x60\n"
- "ins v18.d[1], temploadreg2\n"
- "ins v19.d[1], temploadreg3\n"
- "ins v20.d[1], temploadreg0\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v21.d[1], temploadreg1\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0], #0x20]\n"
"ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"str q27, [c_ptr3]\n"
- "movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "movi v27.4s, #0\n"
+ "ldr d19, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"str q28, [c_ptr4]\n"
- "movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ldr d20, [%[b_ptr0], #0x40]\n"
+ "ins v16.d[1], temploadreg0\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"str q29, [c_ptr5]\n"
- "movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ "ldr d21, [%[b_ptr0], #0x50]\n"
+ ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ "ins v17.d[1], temploadreg1\n"
+ ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr7, c_ptr7, #0x10\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ "ins v19.d[1], temploadreg3\n"
+ ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ "ins v20.d[1], temploadreg0\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ins v21.d[1], temploadreg1\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x60\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
- "ins v17.d[1], temploadreg1\n"
".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
- "ldr d21, [%[b_ptr0], #0x50]\n"
- "add %[b_ptr0], %[b_ptr0], #0x60\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v21.d[1], temploadreg1\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
@@ -2038,7 +2147,7 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x60\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
@@ -2079,23 +2188,74 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
+ ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+ ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
+ ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
+ ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
+ ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
+ ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
+ ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
+ ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2242,34 +2402,36 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v13.b}[10], [a_ptr6]\n"
"ld1 {v15.b}[10], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ldr q21, [%[b_ptr0], #0x50]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ldr q22, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x70\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x70\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
@@ -2320,178 +2482,162 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
- "ins v16.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d21, [%[b_ptr0], #0x50]\n"
- "ins v17.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d22, [%[b_ptr0], #0x60]\n"
- "ins v18.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "add %[b_ptr0], %[b_ptr0], #0x70\n"
- "ins v19.d[1], temploadreg3\n"
- "ins v20.d[1], temploadreg0\n"
- "ins v21.d[1], temploadreg1\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v22.d[1], temploadreg2\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0], #0x20]\n"
"ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"str q27, [c_ptr3]\n"
- "movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "movi v27.4s, #0\n"
+ "ldr d19, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"str q28, [c_ptr4]\n"
- "movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ldr d20, [%[b_ptr0], #0x40]\n"
+ "ins v16.d[1], temploadreg0\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"str q29, [c_ptr5]\n"
- "movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ "ldr d21, [%[b_ptr0], #0x50]\n"
+ ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ "ins v17.d[1], temploadreg1\n"
+ ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr7, c_ptr7, #0x10\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ "ldr d22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ins v19.d[1], temploadreg3\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "ins v20.d[1], temploadreg0\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "ins v21.d[1], temploadreg1\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
+ "ins v22.d[1], temploadreg2\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x70\n"
".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
"prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
- "ins v17.d[1], temploadreg1\n"
".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
- "ldr d21, [%[b_ptr0], #0x50]\n"
".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
- "ins v21.d[1], temploadreg1\n"
".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
- "ldr d22, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x70\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v22.d[1], temploadreg2\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "ldr q22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr7, c_ptr7, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ "add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x70\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
@@ -2539,23 +2685,82 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
+ ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+ ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
+ ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
+ ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
+ ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
+ ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
+ ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
+ ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
+ ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
+ ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
+ ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
+ ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
+ ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
+ ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
+ ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
+ ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2643,6 +2848,7 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q2, [a_ptr1], #0x10\n"
"ldr q4, [a_ptr2], #0x10\n"
@@ -2651,7 +2857,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"ldr q10, [a_ptr5], #0x10\n"
"ldr q12, [a_ptr6], #0x10\n"
"ldr q14, [a_ptr7], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr q1, [%[a_ptr0]]\n"
"ldr q3, [a_ptr1]\n"
"ldr q5, [a_ptr2]\n"
@@ -2662,15 +2867,24 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"ldr q15, [a_ptr7]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q2, [a_ptr1], #0x10\n"
+ "ldr q4, [a_ptr2], #0x10\n"
"ldr d1, [%[a_ptr0]], #0x8\n"
+ "ldr q6, [a_ptr3], #0x10\n"
"ldr d3, [a_ptr1], #0x8\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr d5, [a_ptr2], #0x8\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr d7, [a_ptr3], #0x8\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr d9, [a_ptr4], #0x8\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr d11, [a_ptr5], #0x8\n"
"ldr d13, [a_ptr6], #0x8\n"
- "ldr d15, [a_ptr7], #0x8\n"
"ld1 {v1.s}[2], [%[a_ptr0]], #4\n"
+ "ldr d15, [a_ptr7], #0x8\n"
"ld1 {v3.s}[2], [a_ptr1], #4\n"
"ld1 {v5.s}[2], [a_ptr2], #4\n"
"ld1 {v7.s}[2], [a_ptr3], #4\n"
@@ -2678,7 +2892,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v11.s}[2], [a_ptr5], #4\n"
"ld1 {v13.s}[2], [a_ptr6], #4\n"
"ld1 {v15.s}[2], [a_ptr7], #4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v1.b}[12], [%[a_ptr0]]\n"
"ld1 {v3.b}[12], [a_ptr1]\n"
@@ -2711,36 +2924,38 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
"ld1 {v13.b}[14], [a_ptr6]\n"
"ld1 {v15.b}[14], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ldr q21, [%[b_ptr0], #0x50]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ldr q22, [%[b_ptr0], #0x60]\n"
- "movi v31.4s, #0\n"
"ldr q23, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
@@ -2798,192 +3013,248 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n"
".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n"
".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr d16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr temploadreg0, [%[b_ptr0], #0x8]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
- "ins v16.d[1], temploadreg0\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
- "ldr d21, [%[b_ptr0], #0x50]\n"
- "ins v17.d[1], temploadreg1\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
- "ldr d22, [%[b_ptr0], #0x60]\n"
- "ins v18.d[1], temploadreg2\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
- "ldr d23, [%[b_ptr0], #0x70]\n"
- "ins v19.d[1], temploadreg3\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "ins v20.d[1], temploadreg0\n"
- "ins v21.d[1], temploadreg1\n"
- "ins v22.d[1], temploadreg2\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ins v23.d[1], temploadreg3\n"
+ "ldr d16, [%[b_ptr0]]\n"
"ldr temploadreg0, [%[b_ptr0], #0x8]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr d17, [%[b_ptr0], #0x10]\n"
"ldr temploadreg1, [%[b_ptr0], #0x18]\n"
- ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"str q26, [c_ptr2]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
"movi v26.4s, #0\n"
+ "ldr d18, [%[b_ptr0], #0x20]\n"
"ldr temploadreg2, [%[b_ptr0], #0x28]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
- ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"str q27, [c_ptr3]\n"
- "movi v27.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
- ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "movi v27.4s, #0\n"
+ "ldr d19, [%[b_ptr0], #0x30]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x38]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"str q28, [c_ptr4]\n"
- "movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
- ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ "movi v28.4s, #0\n"
+ "ldr d20, [%[b_ptr0], #0x40]\n"
+ "ins v16.d[1], temploadreg0\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"str q29, [c_ptr5]\n"
- "movi v29.4s, #0\n"
"add c_ptr5, c_ptr5, #0x10\n"
- ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ "movi v29.4s, #0\n"
+ "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ "ldr d21, [%[b_ptr0], #0x50]\n"
+ ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ "ins v17.d[1], temploadreg1\n"
+ ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr7, c_ptr7, #0x10\n"
+ "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ "ldr d22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ "ins v18.d[1], temploadreg2\n"
+ ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr d23, [%[b_ptr0], #0x70]\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr d16, [%[b_ptr0]]\n"
+ "ins v19.d[1], temploadreg3\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
- "ins v16.d[1], temploadreg0\n"
+ "ins v20.d[1], temploadreg0\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
- "ldr temploadreg0, [%[b_ptr0], #0x48]\n"
+ "ins v21.d[1], temploadreg1\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "ins v22.d[1], temploadreg2\n"
".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "ins v23.d[1], temploadreg3\n"
".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr d17, [%[b_ptr0], #0x10]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
- "ins v17.d[1], temploadreg1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
- "ldr temploadreg1, [%[b_ptr0], #0x58]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr d18, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
- "ins v18.d[1], temploadreg2\n"
".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
- "ldr temploadreg2, [%[b_ptr0], #0x68]\n"
".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr d19, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
- "ins v19.d[1], temploadreg3\n"
".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
- "ldr temploadreg3, [%[b_ptr0], #0x78]\n"
".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr d20, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
- "ins v20.d[1], temploadreg0\n"
".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
- "ldr d21, [%[b_ptr0], #0x50]\n"
".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
- "ins v21.d[1], temploadreg1\n"
".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
- "ldr d22, [%[b_ptr0], #0x60]\n"
".inst 0x6fa1eaf8 // udot v24.4s, v23.16b, v1.4b[3]\n"
".inst 0x6fa3eaf9 // udot v25.4s, v23.16b, v3.4b[3]\n"
".inst 0x6fa5eafa // udot v26.4s, v23.16b, v5.4b[3]\n"
- "ins v22.d[1], temploadreg2\n"
".inst 0x6fa7eafb // udot v27.4s, v23.16b, v7.4b[3]\n"
".inst 0x6fa9eafc // udot v28.4s, v23.16b, v9.4b[3]\n"
".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n"
".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n"
".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n"
- "ldr d23, [%[b_ptr0], #0x70]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ins v23.d[1], temploadreg3\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "ldr q22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x70]\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
+ ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+ ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
+ ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
+ ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
+ ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
+ ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
+ ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
+ ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
+ ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
+ ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
+ ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
+ ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
+ ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
+ ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
+ ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
+ ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
+ ".inst 0x6fa1eaf8 // udot v24.4s, v23.16b, v1.4b[3]\n"
+ ".inst 0x6fa3eaf9 // udot v25.4s, v23.16b, v3.4b[3]\n"
+ ".inst 0x6fa5eafa // udot v26.4s, v23.16b, v5.4b[3]\n"
+ ".inst 0x6fa7eafb // udot v27.4s, v23.16b, v7.4b[3]\n"
+ ".inst 0x6fa9eafc // udot v28.4s, v23.16b, v9.4b[3]\n"
+ ".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n"
+ ".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n"
+ ".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n"
+ "b 9f\n"
+ "6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
@@ -3039,23 +3310,16 @@ void a64_smallK_hybrid_u8u32_dot_4x8_a55(const uint8_t *A, int lda, const uint8_
".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n"
".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n"
".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n"
- "6:\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp
index 2a7dd3d88d..aeea051662 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_4x8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/a64_smallK_hybrid_u8u32_dot_8x4/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
+void a64_smallK_hybrid_u8u32_dot_8x4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
const long loops_count = iceildiv(N, (int)4) - 1;
const long ldab = lda * sizeof(uint8_t);
const long ldcb = ldc * sizeof(uint32_t);
@@ -153,22 +153,24 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v6.b}[2], [a_ptr6]\n"
"ld1 {v7.b}[2], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "movi v26.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "movi v27.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "movi v28.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "movi v29.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "movi v30.4s, #0\n"
"prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "movi v31.4s, #0\n"
"add %[b_ptr0], %[b_ptr0], #0x10\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
@@ -177,20 +179,17 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
+ "ldr q16, [%[b_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
@@ -216,10 +215,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
- "add %[b_ptr0], %[b_ptr0], #0x10\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
@@ -231,6 +229,8 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
@@ -260,23 +260,34 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -407,24 +418,26 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v6.b}[6], [a_ptr6]\n"
"ld1 {v7.b}[6], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
"movi v26.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v27.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
@@ -440,68 +453,66 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
+ "ldr q16, [%[b_ptr0]]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "add %[b_ptr0], %[b_ptr0], #0x20\n"
- "str q25, [c_ptr1]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "add %[b_ptr0], %[b_ptr0], #0x20\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+ ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
"b.ne 8b\n"
"7:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
+ "ldr q16, [%[b_ptr0]]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
"add %[b_ptr0], %[b_ptr0], #0x20\n"
"str q25, [c_ptr1]\n"
@@ -541,23 +552,42 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -688,26 +718,28 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v6.b}[10], [a_ptr6]\n"
"ld1 {v7.b}[10], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
"movi v27.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x30\n"
".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
@@ -730,49 +762,46 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x30\n"
- "str q25, [c_ptr1]\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
@@ -788,7 +817,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
@@ -802,11 +830,12 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "add %[b_ptr0], %[b_ptr0], #0x30\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
@@ -832,8 +861,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x30\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
@@ -849,23 +879,50 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
+ ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
+ ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
+ ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1004,28 +1061,30 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v6.b}[14], [a_ptr6]\n"
"ld1 {v7.b}[14], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
"movi v28.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
@@ -1055,50 +1114,47 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n"
".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n"
".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
- "str q25, [c_ptr1]\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
@@ -1114,7 +1170,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
@@ -1123,7 +1178,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x6fa1ea79 // udot v25.4s, v19.16b, v1.4b[3]\n"
".inst 0x6fa2ea7a // udot v26.4s, v19.16b, v2.4b[3]\n"
@@ -1137,14 +1191,16 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "add %[b_ptr0], %[b_ptr0], #0x40\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
"str q27, [c_ptr3]\n"
@@ -1167,8 +1223,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x40\n"
".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
@@ -1192,23 +1249,58 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n"
".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n"
".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f81e219 // udot v25.4s, v16.16b, v1.4b[0]\n"
+ ".inst 0x6f82e21a // udot v26.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f83e21b // udot v27.4s, v16.16b, v3.4b[0]\n"
+ ".inst 0x6f84e21c // udot v28.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f85e21d // udot v29.4s, v16.16b, v5.4b[0]\n"
+ ".inst 0x6f86e21e // udot v30.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f87e21f // udot v31.4s, v16.16b, v7.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa1e239 // udot v25.4s, v17.16b, v1.4b[1]\n"
+ ".inst 0x6fa2e23a // udot v26.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa3e23b // udot v27.4s, v17.16b, v3.4b[1]\n"
+ ".inst 0x6fa4e23c // udot v28.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa5e23d // udot v29.4s, v17.16b, v5.4b[1]\n"
+ ".inst 0x6fa6e23e // udot v30.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa7e23f // udot v31.4s, v17.16b, v7.4b[1]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x6f81ea59 // udot v25.4s, v18.16b, v1.4b[2]\n"
+ ".inst 0x6f82ea5a // udot v26.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x6f83ea5b // udot v27.4s, v18.16b, v3.4b[2]\n"
+ ".inst 0x6f84ea5c // udot v28.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x6f85ea5d // udot v29.4s, v18.16b, v5.4b[2]\n"
+ ".inst 0x6f86ea5e // udot v30.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x6f87ea5f // udot v31.4s, v18.16b, v7.4b[2]\n"
+ ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x6fa1ea79 // udot v25.4s, v19.16b, v1.4b[3]\n"
+ ".inst 0x6fa2ea7a // udot v26.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x6fa3ea7b // udot v27.4s, v19.16b, v3.4b[3]\n"
+ ".inst 0x6fa4ea7c // udot v28.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x6fa5ea7d // udot v29.4s, v19.16b, v5.4b[3]\n"
+ ".inst 0x6fa6ea7e // udot v30.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x6fa7ea7f // udot v31.4s, v19.16b, v7.4b[3]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1287,26 +1379,34 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q2, [a_ptr1], #0x10\n"
"ldr q4, [a_ptr2], #0x10\n"
"ldr q6, [a_ptr3], #0x10\n"
- "ldr q8, [a_ptr4], #0x10\n"
- "ldr q10, [a_ptr5], #0x10\n"
- "ldr q12, [a_ptr6], #0x10\n"
- "ldr q14, [a_ptr7], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr s1, [%[a_ptr0]]\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr s3, [a_ptr1]\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr s5, [a_ptr2]\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr s7, [a_ptr3]\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr s9, [a_ptr4]\n"
"ldr s11, [a_ptr5]\n"
"ldr s13, [a_ptr6]\n"
"ldr s15, [a_ptr7]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
"subs %[odds], %[odds], #0x1\n"
+ "ldr q2, [a_ptr1], #0x10\n"
+ "ldr q4, [a_ptr2], #0x10\n"
+ "ldr q6, [a_ptr3], #0x10\n"
+ "ldr q8, [a_ptr4], #0x10\n"
+ "ldr q10, [a_ptr5], #0x10\n"
+ "ldr q12, [a_ptr6], #0x10\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"b.ne 4f\n"
"ldr b1, [%[a_ptr0]]\n"
"ldr b3, [a_ptr1]\n"
@@ -1339,30 +1439,32 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v13.b}[2], [a_ptr6]\n"
"ld1 {v15.b}[2], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x50\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
"movi v29.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x50\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
@@ -1399,51 +1501,48 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
- "add %[b_ptr0], %[b_ptr0], #0x50\n"
- "str q25, [c_ptr1]\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x50\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
@@ -1459,7 +1558,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
@@ -1468,7 +1566,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
@@ -1477,7 +1574,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
@@ -1491,14 +1587,17 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
- "add %[b_ptr0], %[b_ptr0], #0x50\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
@@ -1521,8 +1620,9 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x50\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
@@ -1554,23 +1654,66 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
+ ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1649,34 +1792,42 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q2, [a_ptr1], #0x10\n"
"ldr q4, [a_ptr2], #0x10\n"
"ldr q6, [a_ptr3], #0x10\n"
- "ldr q8, [a_ptr4], #0x10\n"
- "ldr q10, [a_ptr5], #0x10\n"
- "ldr q12, [a_ptr6], #0x10\n"
- "ldr q14, [a_ptr7], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr d1, [%[a_ptr0]]\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr d3, [a_ptr1]\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr d5, [a_ptr2]\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr d7, [a_ptr3]\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr d9, [a_ptr4]\n"
"ldr d11, [a_ptr5]\n"
"ldr d13, [a_ptr6]\n"
"ldr d15, [a_ptr7]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q2, [a_ptr1], #0x10\n"
+ "ldr q4, [a_ptr2], #0x10\n"
"ldr s1, [%[a_ptr0]], #0x4\n"
+ "ldr q6, [a_ptr3], #0x10\n"
"ldr s3, [a_ptr1], #0x4\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr s5, [a_ptr2], #0x4\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr s7, [a_ptr3], #0x4\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr s9, [a_ptr4], #0x4\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr s11, [a_ptr5], #0x4\n"
"ldr s13, [a_ptr6], #0x4\n"
"ldr s15, [a_ptr7], #0x4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v1.b}[4], [%[a_ptr0]]\n"
"ld1 {v3.b}[4], [a_ptr1]\n"
@@ -1709,32 +1860,34 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v13.b}[6], [a_ptr6]\n"
"ld1 {v15.b}[6], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ldr q21, [%[b_ptr0], #0x50]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x60\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
"movi v30.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x60\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
@@ -1778,68 +1931,64 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ldr q21, [%[b_ptr0], #0x50]\n"
- "add %[b_ptr0], %[b_ptr0], #0x60\n"
- "str q25, [c_ptr1]\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
- ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x60\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
@@ -1848,7 +1997,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
@@ -1857,7 +2005,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
@@ -1866,7 +2013,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
@@ -1880,20 +2026,25 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ldr q21, [%[b_ptr0], #0x50]\n"
- "add %[b_ptr0], %[b_ptr0], #0x60\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
@@ -1910,7 +2061,7 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"movi v31.4s, #0\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x60\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
@@ -1951,23 +2102,74 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
+ ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+ ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
+ ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
+ ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
+ ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
+ ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
+ ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
+ ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2106,34 +2308,36 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v13.b}[10], [a_ptr6]\n"
"ld1 {v15.b}[10], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ldr q21, [%[b_ptr0], #0x50]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ldr q22, [%[b_ptr0], #0x60]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x70\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
"movi v31.4s, #0\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x70\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
@@ -2184,34 +2388,31 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
- "ldr q21, [%[b_ptr0], #0x50]\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ldr q22, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x70\n"
- "str q25, [c_ptr1]\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "ldr q22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
"add c_ptr3, c_ptr3, #0x10\n"
@@ -2230,24 +2431,23 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x70\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
@@ -2255,7 +2455,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
@@ -2264,7 +2463,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
@@ -2273,7 +2471,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
@@ -2282,7 +2479,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
- "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
@@ -2296,38 +2492,44 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ldr q22, [%[b_ptr0], #0x60]\n"
- "add %[b_ptr0], %[b_ptr0], #0x70\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "ldr q22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
- "add c_ptr7, c_ptr7, #0x10\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ "add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x70\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
@@ -2375,23 +2577,82 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
+ "b 9f\n"
"6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
+ ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
+ ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
+ ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
+ ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+ ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
+ ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
+ ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
+ ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
+ ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
+ ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
+ ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
+ ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
+ ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
+ ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
+ ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
+ ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
+ ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
+ ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
+ ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2471,6 +2732,7 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
+ "cbnz %[odds], 2f\n"
"ldr q0, [%[a_ptr0]], #0x10\n"
"ldr q2, [a_ptr1], #0x10\n"
"ldr q4, [a_ptr2], #0x10\n"
@@ -2479,7 +2741,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"ldr q10, [a_ptr5], #0x10\n"
"ldr q12, [a_ptr6], #0x10\n"
"ldr q14, [a_ptr7], #0x10\n"
- "cbnz %[odds], 2f\n"
"ldr q1, [%[a_ptr0]]\n"
"ldr q3, [a_ptr1]\n"
"ldr q5, [a_ptr2]\n"
@@ -2490,15 +2751,24 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"ldr q15, [a_ptr7]\n"
"b 3f\n"
"2:\n"
+ "ldr q0, [%[a_ptr0]], #0x10\n"
+ "subs %[odds], %[odds], #0x1\n"
+ "ldr q2, [a_ptr1], #0x10\n"
+ "ldr q4, [a_ptr2], #0x10\n"
"ldr d1, [%[a_ptr0]], #0x8\n"
+ "ldr q6, [a_ptr3], #0x10\n"
"ldr d3, [a_ptr1], #0x8\n"
+ "ldr q8, [a_ptr4], #0x10\n"
"ldr d5, [a_ptr2], #0x8\n"
+ "ldr q10, [a_ptr5], #0x10\n"
"ldr d7, [a_ptr3], #0x8\n"
+ "ldr q12, [a_ptr6], #0x10\n"
"ldr d9, [a_ptr4], #0x8\n"
+ "ldr q14, [a_ptr7], #0x10\n"
"ldr d11, [a_ptr5], #0x8\n"
"ldr d13, [a_ptr6], #0x8\n"
- "ldr d15, [a_ptr7], #0x8\n"
"ld1 {v1.s}[2], [%[a_ptr0]], #4\n"
+ "ldr d15, [a_ptr7], #0x8\n"
"ld1 {v3.s}[2], [a_ptr1], #4\n"
"ld1 {v5.s}[2], [a_ptr2], #4\n"
"ld1 {v7.s}[2], [a_ptr3], #4\n"
@@ -2506,7 +2776,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v11.s}[2], [a_ptr5], #4\n"
"ld1 {v13.s}[2], [a_ptr6], #4\n"
"ld1 {v15.s}[2], [a_ptr7], #4\n"
- "subs %[odds], %[odds], #0x1\n"
"b.ne 4f\n"
"ld1 {v1.b}[12], [%[a_ptr0]]\n"
"ld1 {v3.b}[12], [a_ptr1]\n"
@@ -2539,36 +2808,38 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"ld1 {v13.b}[14], [a_ptr6]\n"
"ld1 {v15.b}[14], [a_ptr7]\n"
"3:\n"
- "movi v24.4s, #0\n"
"ldr q16, [%[b_ptr0]]\n"
- "movi v25.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ldr q17, [%[b_ptr0], #0x10]\n"
- "movi v26.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ldr q18, [%[b_ptr0], #0x20]\n"
- "movi v27.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ldr q19, [%[b_ptr0], #0x30]\n"
- "movi v28.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ldr q20, [%[b_ptr0], #0x40]\n"
- "movi v29.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ldr q21, [%[b_ptr0], #0x50]\n"
- "movi v30.4s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ldr q22, [%[b_ptr0], #0x60]\n"
- "movi v31.4s, #0\n"
"ldr q23, [%[b_ptr0], #0x70]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "cbz %[loops], 6f\n"
+ "movi v24.4s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
@@ -2626,39 +2897,37 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n"
".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n"
".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n"
- "cbz %[loops], 6f\n"
- "ldr q16, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
- "ldr q21, [%[b_ptr0], #0x50]\n"
- "ldr q22, [%[b_ptr0], #0x60]\n"
"b.eq 7f\n"
"8:\n"
"str q24, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"movi v24.4s, #0\n"
- "ldr q23, [%[b_ptr0], #0x70]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
- "str q25, [c_ptr1]\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
- "movi v25.4s, #0\n"
+ "str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
+ "movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
+ "add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr2, c_ptr2, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "ldr q22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x70]\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
"add c_ptr4, c_ptr4, #0x10\n"
@@ -2673,32 +2942,29 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
- "ldr q16, [%[b_ptr0]]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
- "ldr q17, [%[b_ptr0], #0x10]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
- "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
@@ -2707,7 +2973,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
- "ldr q19, [%[b_ptr0], #0x30]\n"
".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
@@ -2716,7 +2981,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
- "ldr q20, [%[b_ptr0], #0x40]\n"
".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
@@ -2725,7 +2989,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
- "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
@@ -2734,7 +2997,6 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
- "ldr q22, [%[b_ptr0], #0x60]\n"
".inst 0x6fa1eaf8 // udot v24.4s, v23.16b, v1.4b[3]\n"
".inst 0x6fa3eaf9 // udot v25.4s, v23.16b, v3.4b[3]\n"
".inst 0x6fa5eafa // udot v26.4s, v23.16b, v5.4b[3]\n"
@@ -2748,38 +3010,119 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"movi v24.4s, #0\n"
- "ldr q23, [%[b_ptr0], #0x70]\n"
- "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ "ldr q16, [%[b_ptr0]]\n"
+ "ldr q17, [%[b_ptr0], #0x10]\n"
"str q25, [c_ptr1]\n"
"add c_ptr1, c_ptr1, #0x10\n"
"movi v25.4s, #0\n"
+ "ldr q18, [%[b_ptr0], #0x20]\n"
".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
"str q26, [c_ptr2]\n"
"movi v26.4s, #0\n"
+ "ldr q19, [%[b_ptr0], #0x30]\n"
+ "ldr q20, [%[b_ptr0], #0x40]\n"
"add c_ptr2, c_ptr2, #0x10\n"
".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
"str q27, [c_ptr3]\n"
"movi v27.4s, #0\n"
- "add c_ptr3, c_ptr3, #0x10\n"
+ "ldr q21, [%[b_ptr0], #0x50]\n"
".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ "ldr q22, [%[b_ptr0], #0x60]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
"str q28, [c_ptr4]\n"
"movi v28.4s, #0\n"
- "add c_ptr4, c_ptr4, #0x10\n"
+ "ldr q23, [%[b_ptr0], #0x70]\n"
".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ "add c_ptr3, c_ptr3, #0x10\n"
+ ".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
"str q29, [c_ptr5]\n"
"movi v29.4s, #0\n"
- "add c_ptr5, c_ptr5, #0x10\n"
+ "add c_ptr4, c_ptr4, #0x10\n"
".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
"str q30, [c_ptr6]\n"
"movi v30.4s, #0\n"
- "add c_ptr6, c_ptr6, #0x10\n"
+ "add c_ptr5, c_ptr5, #0x10\n"
".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
"str q31, [c_ptr7]\n"
"movi v31.4s, #0\n"
+ "add c_ptr6, c_ptr6, #0x10\n"
+ ".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
"add c_ptr7, c_ptr7, #0x10\n"
+ ".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ "add %[b_ptr0], %[b_ptr0], #0x80\n"
+ ".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
+ ".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
+ ".inst 0x6fa8e23c // udot v28.4s, v17.16b, v8.4b[1]\n"
+ ".inst 0x6faae23d // udot v29.4s, v17.16b, v10.4b[1]\n"
+ ".inst 0x6face23e // udot v30.4s, v17.16b, v12.4b[1]\n"
+ ".inst 0x6faee23f // udot v31.4s, v17.16b, v14.4b[1]\n"
+ ".inst 0x6f80ea58 // udot v24.4s, v18.16b, v0.4b[2]\n"
+ ".inst 0x6f82ea59 // udot v25.4s, v18.16b, v2.4b[2]\n"
+ ".inst 0x6f84ea5a // udot v26.4s, v18.16b, v4.4b[2]\n"
+ ".inst 0x6f86ea5b // udot v27.4s, v18.16b, v6.4b[2]\n"
+ ".inst 0x6f88ea5c // udot v28.4s, v18.16b, v8.4b[2]\n"
+ ".inst 0x6f8aea5d // udot v29.4s, v18.16b, v10.4b[2]\n"
+ ".inst 0x6f8cea5e // udot v30.4s, v18.16b, v12.4b[2]\n"
+ ".inst 0x6f8eea5f // udot v31.4s, v18.16b, v14.4b[2]\n"
+ ".inst 0x6fa0ea78 // udot v24.4s, v19.16b, v0.4b[3]\n"
+ ".inst 0x6fa2ea79 // udot v25.4s, v19.16b, v2.4b[3]\n"
+ ".inst 0x6fa4ea7a // udot v26.4s, v19.16b, v4.4b[3]\n"
+ ".inst 0x6fa6ea7b // udot v27.4s, v19.16b, v6.4b[3]\n"
+ ".inst 0x6fa8ea7c // udot v28.4s, v19.16b, v8.4b[3]\n"
+ ".inst 0x6faaea7d // udot v29.4s, v19.16b, v10.4b[3]\n"
+ ".inst 0x6facea7e // udot v30.4s, v19.16b, v12.4b[3]\n"
+ ".inst 0x6faeea7f // udot v31.4s, v19.16b, v14.4b[3]\n"
+ ".inst 0x6f81e298 // udot v24.4s, v20.16b, v1.4b[0]\n"
+ ".inst 0x6f83e299 // udot v25.4s, v20.16b, v3.4b[0]\n"
+ ".inst 0x6f85e29a // udot v26.4s, v20.16b, v5.4b[0]\n"
+ ".inst 0x6f87e29b // udot v27.4s, v20.16b, v7.4b[0]\n"
+ ".inst 0x6f89e29c // udot v28.4s, v20.16b, v9.4b[0]\n"
+ ".inst 0x6f8be29d // udot v29.4s, v20.16b, v11.4b[0]\n"
+ ".inst 0x6f8de29e // udot v30.4s, v20.16b, v13.4b[0]\n"
+ ".inst 0x6f8fe29f // udot v31.4s, v20.16b, v15.4b[0]\n"
+ ".inst 0x6fa1e2b8 // udot v24.4s, v21.16b, v1.4b[1]\n"
+ ".inst 0x6fa3e2b9 // udot v25.4s, v21.16b, v3.4b[1]\n"
+ ".inst 0x6fa5e2ba // udot v26.4s, v21.16b, v5.4b[1]\n"
+ ".inst 0x6fa7e2bb // udot v27.4s, v21.16b, v7.4b[1]\n"
+ ".inst 0x6fa9e2bc // udot v28.4s, v21.16b, v9.4b[1]\n"
+ ".inst 0x6fabe2bd // udot v29.4s, v21.16b, v11.4b[1]\n"
+ ".inst 0x6fade2be // udot v30.4s, v21.16b, v13.4b[1]\n"
+ ".inst 0x6fafe2bf // udot v31.4s, v21.16b, v15.4b[1]\n"
+ ".inst 0x6f81ead8 // udot v24.4s, v22.16b, v1.4b[2]\n"
+ ".inst 0x6f83ead9 // udot v25.4s, v22.16b, v3.4b[2]\n"
+ ".inst 0x6f85eada // udot v26.4s, v22.16b, v5.4b[2]\n"
+ ".inst 0x6f87eadb // udot v27.4s, v22.16b, v7.4b[2]\n"
+ ".inst 0x6f89eadc // udot v28.4s, v22.16b, v9.4b[2]\n"
+ ".inst 0x6f8beadd // udot v29.4s, v22.16b, v11.4b[2]\n"
+ ".inst 0x6f8deade // udot v30.4s, v22.16b, v13.4b[2]\n"
+ ".inst 0x6f8feadf // udot v31.4s, v22.16b, v15.4b[2]\n"
+ ".inst 0x6fa1eaf8 // udot v24.4s, v23.16b, v1.4b[3]\n"
+ ".inst 0x6fa3eaf9 // udot v25.4s, v23.16b, v3.4b[3]\n"
+ ".inst 0x6fa5eafa // udot v26.4s, v23.16b, v5.4b[3]\n"
+ ".inst 0x6fa7eafb // udot v27.4s, v23.16b, v7.4b[3]\n"
+ ".inst 0x6fa9eafc // udot v28.4s, v23.16b, v9.4b[3]\n"
+ ".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n"
+ ".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n"
+ ".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n"
+ "b 9f\n"
+ "6:\n"
+ "movi v24.4s, #0\n"
+ "movi v25.4s, #0\n"
+ "movi v26.4s, #0\n"
+ "movi v27.4s, #0\n"
+ "movi v28.4s, #0\n"
+ "movi v29.4s, #0\n"
+ "movi v30.4s, #0\n"
+ "movi v31.4s, #0\n"
+ ".inst 0x6f80e218 // udot v24.4s, v16.16b, v0.4b[0]\n"
+ ".inst 0x6f82e219 // udot v25.4s, v16.16b, v2.4b[0]\n"
+ ".inst 0x6f84e21a // udot v26.4s, v16.16b, v4.4b[0]\n"
+ ".inst 0x6f86e21b // udot v27.4s, v16.16b, v6.4b[0]\n"
+ ".inst 0x6f88e21c // udot v28.4s, v16.16b, v8.4b[0]\n"
+ ".inst 0x6f8ae21d // udot v29.4s, v16.16b, v10.4b[0]\n"
".inst 0x6f8ce21e // udot v30.4s, v16.16b, v12.4b[0]\n"
- ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x6f8ee21f // udot v31.4s, v16.16b, v14.4b[0]\n"
+ ".inst 0x6fa0e238 // udot v24.4s, v17.16b, v0.4b[1]\n"
".inst 0x6fa2e239 // udot v25.4s, v17.16b, v2.4b[1]\n"
".inst 0x6fa4e23a // udot v26.4s, v17.16b, v4.4b[1]\n"
".inst 0x6fa6e23b // udot v27.4s, v17.16b, v6.4b[1]\n"
@@ -2835,23 +3178,16 @@ void a64_smallK_hybrid_u8u32_dot_4x8(const uint8_t *A, int lda, const uint8_t *B
".inst 0x6fabeafd // udot v29.4s, v23.16b, v11.4b[3]\n"
".inst 0x6fadeafe // udot v30.4s, v23.16b, v13.4b[3]\n"
".inst 0x6fafeaff // udot v31.4s, v23.16b, v15.4b[3]\n"
- "6:\n"
+ "9:\n"
"str q24, [%[c_ptr0]]\n"
"add %[c_ptr0], %[c_ptr0], #0x10\n"
"str q25, [c_ptr1]\n"
- "add c_ptr1, c_ptr1, #0x10\n"
"str q26, [c_ptr2]\n"
- "add c_ptr2, c_ptr2, #0x10\n"
"str q27, [c_ptr3]\n"
- "add c_ptr3, c_ptr3, #0x10\n"
"str q28, [c_ptr4]\n"
- "add c_ptr4, c_ptr4, #0x10\n"
"str q29, [c_ptr5]\n"
- "add c_ptr5, c_ptr5, #0x10\n"
"str q30, [c_ptr6]\n"
- "add c_ptr6, c_ptr6, #0x10\n"
"str q31, [c_ptr7]\n"
- "add c_ptr7, c_ptr7, #0x10\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp
index 1bc8021e76..57fd9c909e 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL.hpp
@@ -23,34 +23,28 @@
*/
#pragma once
-#ifdef __ARM_FEATURE_SVE
-
+#ifdef __aarch64__
+#include "../performance_parameters.hpp"
#include "../std_transforms_sve.hpp"
namespace arm_gemm
{
// Actual kernel implementations
-void sve_hybrid_fp32_mla_4VLx4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+void sve_gemv_fp32_mla_8VL(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool);
-class hybrid_fp32_mla_4VLx4
+class cls_sve_gemv_fp32_mla_8VL
{
public:
typedef float operand_type;
typedef float result_type;
- typedef void (*kern_type)(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
-
- /* Kernel blocking parameters */
- static constexpr unsigned int out_height()
- {
- return 4;
- }
+ typedef void (*kern_type)(const float *, const float *, float *, size_t, size_t, const float *, Activation, bool);
static unsigned int out_width()
{
- return get_vector_length<float>() * 4;
+ return 8 * get_vector_length<float>();
}
static constexpr unsigned int k_unroll()
@@ -60,7 +54,7 @@ public:
static constexpr bool supports_accumulate()
{
- return true;
+ return false;
}
static constexpr bool supports_bias()
@@ -73,17 +67,16 @@ public:
return true;
}
- StdTransformsSVE<operand_type, result_type, 4, 4, 1> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 1, 8, 1> transforms = {};
// Default to the generic kernel
- kern_type kernel=sve_hybrid_fp32_mla_4VLx4;
+ kern_type kernel=sve_gemv_fp32_mla_8VL;
- hybrid_fp32_mla_4VLx4(const CPUInfo *)
+ cls_sve_gemv_fp32_mla_8VL(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
-#endif // __ARM_FEATURE_SVE
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp
new file mode 100644
index 0000000000..c62e31936c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_gemv_fp32_mla_8VL/generic.cpp
@@ -0,0 +1,1372 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_gemv_fp32_mla_8VL (
+ const float *A_ptr, const float *B_ptr, float *output_ptr,
+ size_t N, size_t K,
+ const float *bias, Activation act, bool
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ const float *B_ptr = {};
+ size_t output_offset = {};
+ unsigned int input_initial_col = {};
+ } ka;
+
+ unsigned long flags=0;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "cntw x24\n"
+ "add x23, %x[N], x24\n"
+ "sub x23, x23, #0x1\n"
+ "udiv x23, x23, x24\n"
+ "mov x22, %x[bias]\n"
+ "1:" // Column loop
+ "cmp x23, #0x8\n"
+ "bge 50f\n"
+ "cmp x23, #0x6\n"
+ "bgt 43f\n"
+ "beq 36f\n"
+ "cmp x23, #0x4\n"
+ "bgt 29f\n"
+ "beq 22f\n"
+ "cmp x23, #0x2\n"
+ "bgt 15f\n"
+ "beq 8f\n"
+ "mov x21, %x[K]\n"
+ "mov x20, %x[A_ptr]\n"
+ "whilelt p1.s, XZR, %x[N]\n"
+ "cbz x22, 2f\n"
+ "ld1w { z24.s }, p2/Z, [x22]\n"
+ "addvl x22, x22, #1\n"
+ "b 3f\n"
+ "2:" // Width 1: no bias
+ "mov z24.b, #0x0\n"
+ "3:" // Width 1: setup done
+ "cmp x21, #0x4\n"
+ "ble 5f\n"
+ "4:" // Width 1: Multiply loop: Main loop head
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z1.s, z0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "add x20, x20, #0x10\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "sub x21, x21, #0x4\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z2.s, z0.s[1]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "cmp x21, #0x4\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z3.s, z0.s[2]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z4.s, z0.s[3]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "bgt 4b\n"
+ "5:" // Width 1: Multiply loop: Single iteration only
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z5.s, z0.s[0]\n"
+ "add x20, x20, #0x10\n"
+ "subs x21, x21, #0x1\n"
+ "ble 6f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z6.s, z0.s[1]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "ble 6f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z7.s, z0.s[2]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "ble 6f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z8.s, z0.s[3]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "6:" // Width 1: Multiply loop: multiply skip
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 7f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "7:" // Width 1: No activation
+ "st1w { z24.s }, p1, [%x[output_ptr]]\n"
+ "addvl %x[output_ptr], %x[output_ptr], #1\n"
+ "b 57f\n"
+ "8:" // Width 2
+ "mov x21, %x[K]\n"
+ "mov x20, %x[A_ptr]\n"
+ "sub x19, %x[N], x24\n"
+ "whilelt p1.s, XZR, x19\n"
+ "cbz x22, 9f\n"
+ "ld1w { z24.s }, p2/Z, [x22]\n"
+ "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "addvl x22, x22, #2\n"
+ "b 10f\n"
+ "9:" // Width 2: no bias
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "10:" // Width 2: setup done
+ "cmp x21, #0x4\n"
+ "ble 12f\n"
+ "11:" // Width 2: Multiply loop: Main loop head
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z1.s, z0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z2.s, z0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "sub x21, x21, #0x4\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z3.s, z0.s[1]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z25.s, z4.s, z0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "cmp x21, #0x4\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z5.s, z0.s[2]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z25.s, z6.s, z0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z7.s, z0.s[3]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z25.s, z8.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "bgt 11b\n"
+ "12:" // Width 2: Multiply loop: Single iteration only
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z9.s, z0.s[0]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z10.s, z0.s[0]\n"
+ "subs x21, x21, #0x1\n"
+ "ble 13f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z11.s, z0.s[1]\n"
+ "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z25.s, z12.s, z0.s[1]\n"
+ "ble 13f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z13.s, z0.s[2]\n"
+ "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z25.s, z14.s, z0.s[2]\n"
+ "ble 13f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z15.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z15.s, z0.s[3]\n"
+ "ld1w { z16.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z25.s, z16.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "13:" // Width 2: Multiply loop: multiply skip
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 14f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "14:" // Width 2: No activation
+ "st1w { z24.s }, p2, [%x[output_ptr]]\n"
+ "st1w { z25.s }, p1, [%x[output_ptr], #1, MUL VL]\n"
+ "addvl %x[output_ptr], %x[output_ptr], #2\n"
+ "b 57f\n"
+ "15:" // Width 3
+ "mov x21, %x[K]\n"
+ "mov x20, %x[A_ptr]\n"
+ "mov x19, #0x2\n"
+ "msub x19, x24, x19, %x[N]\n"
+ "whilelt p1.s, XZR, x19\n"
+ "cbz x22, 16f\n"
+ "ld1w { z24.s }, p2/Z, [x22]\n"
+ "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "addvl x22, x22, #3\n"
+ "b 17f\n"
+ "16:" // Width 3: no bias
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "17:" // Width 3: setup done
+ "cmp x21, #0x4\n"
+ "ble 19f\n"
+ "18:" // Width 3: Multiply loop: Main loop head
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "sub x21, x21, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z1.s, z0.s[0]\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z2.s, z0.s[0]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z26.s, z3.s, z0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "cmp x21, #0x4\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z4.s, z0.s[1]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z5.s, z0.s[1]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z26.s, z6.s, z0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z7.s, z0.s[2]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z8.s, z0.s[2]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z26.s, z9.s, z0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z10.s, z0.s[3]\n"
+ "ld1w { z11.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z12.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z11.s, z0.s[3]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z26.s, z12.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "bgt 18b\n"
+ "19:" // Width 3: Multiply loop: Single iteration only
+ "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z13.s, z0.s[0]\n"
+ "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z14.s, z0.s[0]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z26.s, z15.s, z0.s[0]\n"
+ "ble 20f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z16.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z16.s, z0.s[1]\n"
+ "ld1w { z17.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z17.s, z0.s[1]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z26.s, z18.s, z0.s[1]\n"
+ "ble 20f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z19.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z19.s, z0.s[2]\n"
+ "ld1w { z20.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z20.s, z0.s[2]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z26.s, z21.s, z0.s[2]\n"
+ "ble 20f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z22.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z22.s, z0.s[3]\n"
+ "ld1w { z23.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z23.s, z0.s[3]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z26.s, z1.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "20:" // Width 3: Multiply loop: multiply skip
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 21f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmin z26.s, p2/M, z26.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "fmax z26.s, p2/M, z26.s, z17.s\n"
+ "21:" // Width 3: No activation
+ "st1w { z24.s }, p2, [%x[output_ptr]]\n"
+ "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
+ "st1w { z26.s }, p1, [%x[output_ptr], #2, MUL VL]\n"
+ "addvl %x[output_ptr], %x[output_ptr], #3\n"
+ "b 57f\n"
+ "22:" // Width 4
+ "mov x21, %x[K]\n"
+ "mov x20, %x[A_ptr]\n"
+ "mov x19, #0x3\n"
+ "msub x19, x24, x19, %x[N]\n"
+ "whilelt p1.s, XZR, x19\n"
+ "cbz x22, 23f\n"
+ "ld1w { z24.s }, p2/Z, [x22]\n"
+ "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
+ "addvl x22, x22, #4\n"
+ "b 24f\n"
+ "23:" // Width 4: no bias
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "24:" // Width 4: setup done
+ "cmp x21, #0x4\n"
+ "ble 26f\n"
+ "25:" // Width 4: Multiply loop: Main loop head
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "sub x21, x21, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z1.s, z0.s[0]\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z2.s, z0.s[0]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z26.s, z3.s, z0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "cmp x21, #0x4\n"
+ "fmla z27.s, z4.s, z0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z5.s, z0.s[1]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z6.s, z0.s[1]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z26.s, z7.s, z0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z27.s, z8.s, z0.s[1]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n"
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "fmla z24.s, z9.s, z0.s[2]\n"
+ "ld1w { z11.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z10.s, z0.s[2]\n"
+ "ld1w { z12.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z26.s, z11.s, z0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z27.s, z12.s, z0.s[2]\n"
+ "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z13.s, z0.s[3]\n"
+ "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z14.s, z0.s[3]\n"
+ "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z26.s, z15.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z27.s, z16.s, z0.s[3]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "bgt 25b\n"
+ "26:" // Width 4: Multiply loop: Single iteration only
+ "ld1w { z17.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z18.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z17.s, z0.s[0]\n"
+ "ld1w { z19.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z18.s, z0.s[0]\n"
+ "ld1w { z20.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z26.s, z19.s, z0.s[0]\n"
+ "fmla z27.s, z20.s, z0.s[0]\n"
+ "ble 27f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z21.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z21.s, z0.s[1]\n"
+ "ld1w { z22.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z23.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z22.s, z0.s[1]\n"
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z26.s, z23.s, z0.s[1]\n"
+ "fmla z27.s, z1.s, z0.s[1]\n"
+ "ble 27f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z2.s, z0.s[2]\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z3.s, z0.s[2]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z26.s, z4.s, z0.s[2]\n"
+ "fmla z27.s, z5.s, z0.s[2]\n"
+ "ble 27f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z6.s, z0.s[3]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z7.s, z0.s[3]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z26.s, z8.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z27.s, z9.s, z0.s[3]\n"
+ "27:" // Width 4: Multiply loop: multiply skip
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 28f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmin z26.s, p2/M, z26.s, z16.s\n"
+ "fmin z27.s, p2/M, z27.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "fmax z26.s, p2/M, z26.s, z17.s\n"
+ "fmax z27.s, p2/M, z27.s, z17.s\n"
+ "28:" // Width 4: No activation
+ "st1w { z24.s }, p2, [%x[output_ptr]]\n"
+ "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [%x[output_ptr], #3, MUL VL]\n"
+ "addvl %x[output_ptr], %x[output_ptr], #4\n"
+ "b 57f\n"
+ "29:" // Width 5
+ "mov x21, %x[K]\n"
+ "mov x20, %x[A_ptr]\n"
+ "mov x19, #0x4\n"
+ "msub x19, x24, x19, %x[N]\n"
+ "whilelt p1.s, XZR, x19\n"
+ "cbz x22, 30f\n"
+ "ld1w { z24.s }, p2/Z, [x22]\n"
+ "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
+ "addvl x22, x22, #5\n"
+ "b 31f\n"
+ "30:" // Width 5: no bias
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "31:" // Width 5: setup done
+ "cmp x21, #0x4\n"
+ "ble 33f\n"
+ "32:" // Width 5: Multiply loop: Main loop head
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "sub x21, x21, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z1.s, z0.s[0]\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z2.s, z0.s[0]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "cmp x21, #0x4\n"
+ "fmla z26.s, z3.s, z0.s[0]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z27.s, z4.s, z0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z28.s, z5.s, z0.s[0]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "fmla z24.s, z6.s, z0.s[1]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z7.s, z0.s[1]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "fmla z26.s, z8.s, z0.s[1]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z27.s, z9.s, z0.s[1]\n"
+ "fmla z28.s, z10.s, z0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z11.s, z0.s[2]\n"
+ "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z13.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z12.s, z0.s[2]\n"
+ "ld1w { z14.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z13.s, z0.s[2]\n"
+ "ld1w { z15.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z27.s, z14.s, z0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z28.s, z15.s, z0.s[2]\n"
+ "ld1w { z16.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z16.s, z0.s[3]\n"
+ "ld1w { z17.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z17.s, z0.s[3]\n"
+ "ld1w { z19.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z18.s, z0.s[3]\n"
+ "ld1w { z20.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z27.s, z19.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z28.s, z20.s, z0.s[3]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "bgt 32b\n"
+ "33:" // Width 5: Multiply loop: Single iteration only
+ "ld1w { z21.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z22.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z21.s, z0.s[0]\n"
+ "ld1w { z23.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z22.s, z0.s[0]\n"
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "fmla z26.s, z23.s, z0.s[0]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z27.s, z1.s, z0.s[0]\n"
+ "fmla z28.s, z2.s, z0.s[0]\n"
+ "ble 34f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z3.s, z0.s[1]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z4.s, z0.s[1]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z5.s, z0.s[1]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z27.s, z6.s, z0.s[1]\n"
+ "fmla z28.s, z7.s, z0.s[1]\n"
+ "ble 34f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z8.s, z0.s[2]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z9.s, z0.s[2]\n"
+ "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z10.s, z0.s[2]\n"
+ "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z27.s, z11.s, z0.s[2]\n"
+ "fmla z28.s, z12.s, z0.s[2]\n"
+ "ble 34f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z13.s, z0.s[3]\n"
+ "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z14.s, z0.s[3]\n"
+ "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z15.s, z0.s[3]\n"
+ "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z27.s, z16.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z28.s, z17.s, z0.s[3]\n"
+ "34:" // Width 5: Multiply loop: multiply skip
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 35f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmin z26.s, p2/M, z26.s, z16.s\n"
+ "fmin z27.s, p2/M, z27.s, z16.s\n"
+ "fmin z28.s, p2/M, z28.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "fmax z26.s, p2/M, z26.s, z17.s\n"
+ "fmax z27.s, p2/M, z27.s, z17.s\n"
+ "fmax z28.s, p2/M, z28.s, z17.s\n"
+ "35:" // Width 5: No activation
+ "st1w { z24.s }, p2, [%x[output_ptr]]\n"
+ "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
+ "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
+ "st1w { z28.s }, p1, [%x[output_ptr], #4, MUL VL]\n"
+ "addvl %x[output_ptr], %x[output_ptr], #5\n"
+ "b 57f\n"
+ "36:" // Width 6
+ "mov x21, %x[K]\n"
+ "mov x20, %x[A_ptr]\n"
+ "mov x19, #0x5\n"
+ "msub x19, x24, x19, %x[N]\n"
+ "whilelt p1.s, XZR, x19\n"
+ "cbz x22, 37f\n"
+ "ld1w { z24.s }, p2/Z, [x22]\n"
+ "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n"
+ "addvl x22, x22, #6\n"
+ "b 38f\n"
+ "37:" // Width 6: no bias
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "38:" // Width 6: setup done
+ "cmp x21, #0x4\n"
+ "ble 40f\n"
+ "39:" // Width 6: Multiply loop: Main loop head
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "sub x21, x21, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z1.s, z0.s[0]\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z2.s, z0.s[0]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "cmp x21, #0x4\n"
+ "fmla z26.s, z3.s, z0.s[0]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z4.s, z0.s[0]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z28.s, z5.s, z0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z29.s, z6.s, z0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z7.s, z0.s[1]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z8.s, z0.s[1]\n"
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z9.s, z0.s[1]\n"
+ "ld1w { z11.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z12.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z10.s, z0.s[1]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z28.s, z11.s, z0.s[1]\n"
+ "fmla z29.s, z12.s, z0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z13.s, z0.s[2]\n"
+ "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z14.s, z0.s[2]\n"
+ "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z15.s, z0.s[2]\n"
+ "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z16.s, z0.s[2]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z28.s, z17.s, z0.s[2]\n"
+ "fmla z29.s, z18.s, z0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z19.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z19.s, z0.s[3]\n"
+ "ld1w { z20.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z21.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z20.s, z0.s[3]\n"
+ "ld1w { z22.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z21.s, z0.s[3]\n"
+ "ld1w { z23.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z22.s, z0.s[3]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z28.s, z23.s, z0.s[3]\n"
+ "fmla z29.s, z1.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "bgt 39b\n"
+ "40:" // Width 6: Multiply loop: Single iteration only
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z2.s, z0.s[0]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z3.s, z0.s[0]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "fmla z26.s, z4.s, z0.s[0]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z27.s, z5.s, z0.s[0]\n"
+ "fmla z28.s, z6.s, z0.s[0]\n"
+ "fmla z29.s, z7.s, z0.s[0]\n"
+ "ble 41f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z8.s, z0.s[1]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z9.s, z0.s[1]\n"
+ "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z10.s, z0.s[1]\n"
+ "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z13.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z11.s, z0.s[1]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z28.s, z12.s, z0.s[1]\n"
+ "fmla z29.s, z13.s, z0.s[1]\n"
+ "ble 41f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z14.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z14.s, z0.s[2]\n"
+ "ld1w { z15.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z15.s, z0.s[2]\n"
+ "ld1w { z17.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z16.s, z0.s[2]\n"
+ "ld1w { z18.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z19.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z17.s, z0.s[2]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z28.s, z18.s, z0.s[2]\n"
+ "fmla z29.s, z19.s, z0.s[2]\n"
+ "ble 41f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z20.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z20.s, z0.s[3]\n"
+ "ld1w { z21.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z21.s, z0.s[3]\n"
+ "ld1w { z23.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z22.s, z0.s[3]\n"
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z23.s, z0.s[3]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z28.s, z1.s, z0.s[3]\n"
+ "fmla z29.s, z2.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "41:" // Width 6: Multiply loop: multiply skip
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 42f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmin z26.s, p2/M, z26.s, z16.s\n"
+ "fmin z27.s, p2/M, z27.s, z16.s\n"
+ "fmin z28.s, p2/M, z28.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "fmax z26.s, p2/M, z26.s, z17.s\n"
+ "fmax z27.s, p2/M, z27.s, z17.s\n"
+ "fmax z28.s, p2/M, z28.s, z17.s\n"
+ "fmin z29.s, p2/M, z29.s, z16.s\n"
+ "fmax z29.s, p2/M, z29.s, z17.s\n"
+ "42:" // Width 6: No activation
+ "st1w { z24.s }, p2, [%x[output_ptr]]\n"
+ "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
+ "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
+ "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n"
+ "st1w { z29.s }, p1, [%x[output_ptr], #5, MUL VL]\n"
+ "addvl %x[output_ptr], %x[output_ptr], #6\n"
+ "b 57f\n"
+ "43:" // Width 7
+ "mov x21, %x[K]\n"
+ "mov x20, %x[A_ptr]\n"
+ "mov x19, #0x6\n"
+ "msub x19, x24, x19, %x[N]\n"
+ "whilelt p1.s, XZR, x19\n"
+ "cbz x22, 44f\n"
+ "ld1w { z24.s }, p2/Z, [x22]\n"
+ "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x22, #6, MUL VL]\n"
+ "addvl x22, x22, #7\n"
+ "b 45f\n"
+ "44:" // Width 7: no bias
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "45:" // Width 7: setup done
+ "cmp x21, #0x4\n"
+ "ble 47f\n"
+ "46:" // Width 7: Multiply loop: Main loop head
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "sub x21, x21, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z1.s, z0.s[0]\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z2.s, z0.s[0]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "cmp x21, #0x4\n"
+ "fmla z26.s, z3.s, z0.s[0]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z4.s, z0.s[0]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "fmla z28.s, z5.s, z0.s[0]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z29.s, z6.s, z0.s[0]\n"
+ "fmla z30.s, z7.s, z0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z8.s, z0.s[1]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z9.s, z0.s[1]\n"
+ "ld1w { z11.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z10.s, z0.s[1]\n"
+ "ld1w { z12.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z13.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z11.s, z0.s[1]\n"
+ "ld1w { z14.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z28.s, z12.s, z0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z29.s, z13.s, z0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z15.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z30.s, z14.s, z0.s[1]\n"
+ "ld1w { z16.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "fmla z24.s, z15.s, z0.s[2]\n"
+ "ld1w { z17.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z25.s, z16.s, z0.s[2]\n"
+ "ld1w { z19.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "fmla z26.s, z17.s, z0.s[2]\n"
+ "ld1w { z20.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z18.s, z0.s[2]\n"
+ "ld1w { z21.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z28.s, z19.s, z0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z29.s, z20.s, z0.s[2]\n"
+ "ld1w { z22.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z30.s, z21.s, z0.s[2]\n"
+ "ld1w { z23.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "fmla z24.s, z22.s, z0.s[3]\n"
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z25.s, z23.s, z0.s[3]\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "fmla z26.s, z1.s, z0.s[3]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z2.s, z0.s[3]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z28.s, z3.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z29.s, z4.s, z0.s[3]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "fmla z30.s, z5.s, z0.s[3]\n"
+ "bgt 46b\n"
+ "47:" // Width 7: Multiply loop: Single iteration only
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z6.s, z0.s[0]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z7.s, z0.s[0]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "fmla z26.s, z8.s, z0.s[0]\n"
+ "ld1w { z11.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "ld1w { z12.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "fmla z27.s, z9.s, z0.s[0]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z28.s, z10.s, z0.s[0]\n"
+ "fmla z29.s, z11.s, z0.s[0]\n"
+ "fmla z30.s, z12.s, z0.s[0]\n"
+ "ble 48f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z13.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z13.s, z0.s[1]\n"
+ "ld1w { z14.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z15.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z14.s, z0.s[1]\n"
+ "ld1w { z16.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z15.s, z0.s[1]\n"
+ "ld1w { z17.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z16.s, z0.s[1]\n"
+ "ld1w { z19.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z28.s, z17.s, z0.s[1]\n"
+ "fmla z29.s, z18.s, z0.s[1]\n"
+ "fmla z30.s, z19.s, z0.s[1]\n"
+ "ble 48f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z20.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z20.s, z0.s[2]\n"
+ "ld1w { z21.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z21.s, z0.s[2]\n"
+ "ld1w { z23.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z22.s, z0.s[2]\n"
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z23.s, z0.s[2]\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z28.s, z1.s, z0.s[2]\n"
+ "fmla z29.s, z2.s, z0.s[2]\n"
+ "fmla z30.s, z3.s, z0.s[2]\n"
+ "ble 48f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z4.s, z0.s[3]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z5.s, z0.s[3]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z6.s, z0.s[3]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z7.s, z0.s[3]\n"
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z28.s, z8.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z29.s, z9.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z30.s, z10.s, z0.s[3]\n"
+ "48:" // Width 7: Multiply loop: multiply skip
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 49f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmin z26.s, p2/M, z26.s, z16.s\n"
+ "fmin z27.s, p2/M, z27.s, z16.s\n"
+ "fmin z28.s, p2/M, z28.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "fmax z26.s, p2/M, z26.s, z17.s\n"
+ "fmax z27.s, p2/M, z27.s, z17.s\n"
+ "fmax z28.s, p2/M, z28.s, z17.s\n"
+ "fmin z29.s, p2/M, z29.s, z16.s\n"
+ "fmin z30.s, p2/M, z30.s, z16.s\n"
+ "fmax z29.s, p2/M, z29.s, z17.s\n"
+ "fmax z30.s, p2/M, z30.s, z17.s\n"
+ "49:" // Width 7: No activation
+ "st1w { z24.s }, p2, [%x[output_ptr]]\n"
+ "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
+ "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
+ "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n"
+ "st1w { z29.s }, p2, [%x[output_ptr], #5, MUL VL]\n"
+ "st1w { z30.s }, p1, [%x[output_ptr], #6, MUL VL]\n"
+ "addvl %x[output_ptr], %x[output_ptr], #7\n"
+ "b 57f\n"
+ "50:" // Width 8
+ "mov x21, %x[K]\n"
+ "mov x20, %x[A_ptr]\n"
+ "mov x19, #0x7\n"
+ "msub x19, x24, x19, %x[N]\n"
+ "whilelt p1.s, XZR, x19\n"
+ "cbz x22, 51f\n"
+ "ld1w { z24.s }, p2/Z, [x22]\n"
+ "ld1w { z25.s }, p2/Z, [x22, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x22, #2, MUL VL]\n"
+ "ld1w { z27.s }, p2/Z, [x22, #3, MUL VL]\n"
+ "ld1w { z28.s }, p2/Z, [x22, #4, MUL VL]\n"
+ "ld1w { z29.s }, p2/Z, [x22, #5, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x22, #6, MUL VL]\n"
+ "ld1w { z31.s }, p2/Z, [x22, #7, MUL VL]\n"
+ "addvl x22, x22, #8\n"
+ "b 52f\n"
+ "51:" // Width 8: no bias
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "52:" // Width 8: setup done
+ "cmp x21, #0x4\n"
+ "ble 54f\n"
+ "53:" // Width 8: Multiply loop: Main loop head
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "sub x21, x21, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z1.s, z0.s[0]\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z2.s, z0.s[0]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "cmp x21, #0x4\n"
+ "fmla z26.s, z3.s, z0.s[0]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z4.s, z0.s[0]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "fmla z28.s, z5.s, z0.s[0]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z29.s, z6.s, z0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "fmla z30.s, z7.s, z0.s[0]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z31.s, z8.s, z0.s[0]\n"
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "ld1w { z11.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "ld1w { z12.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z25.s, z10.s, z0.s[1]\n"
+ "ld1w { z13.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "fmla z26.s, z11.s, z0.s[1]\n"
+ "ld1w { z14.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z12.s, z0.s[1]\n"
+ "ld1w { z15.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "fmla z28.s, z13.s, z0.s[1]\n"
+ "ld1w { z16.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z29.s, z14.s, z0.s[1]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z30.s, z15.s, z0.s[1]\n"
+ "ld1w { z17.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z31.s, z16.s, z0.s[1]\n"
+ "ld1w { z18.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "fmla z24.s, z17.s, z0.s[2]\n"
+ "ld1w { z19.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z25.s, z18.s, z0.s[2]\n"
+ "ld1w { z21.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "fmla z26.s, z19.s, z0.s[2]\n"
+ "ld1w { z22.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z20.s, z0.s[2]\n"
+ "ld1w { z23.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "fmla z28.s, z21.s, z0.s[2]\n"
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z29.s, z22.s, z0.s[2]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z30.s, z23.s, z0.s[2]\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z31.s, z1.s, z0.s[2]\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "fmla z24.s, z2.s, z0.s[3]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z25.s, z3.s, z0.s[3]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "fmla z26.s, z4.s, z0.s[3]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z5.s, z0.s[3]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "fmla z28.s, z6.s, z0.s[3]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z29.s, z7.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z30.s, z8.s, z0.s[3]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "fmla z31.s, z9.s, z0.s[3]\n"
+ "bgt 53b\n"
+ "54:" // Width 8: Multiply loop: Single iteration only
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr]]\n"
+ "whilelt p0.s, XZR, x21\n"
+ "ld1w { z11.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x20]\n"
+ "fmla z24.s, z10.s, z0.s[0]\n"
+ "ld1w { z12.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z25.s, z11.s, z0.s[0]\n"
+ "ld1w { z13.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "fmla z26.s, z12.s, z0.s[0]\n"
+ "ld1w { z15.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "fmla z27.s, z13.s, z0.s[0]\n"
+ "fmla z28.s, z14.s, z0.s[0]\n"
+ "ld1w { z17.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z29.s, z15.s, z0.s[0]\n"
+ "fmla z30.s, z16.s, z0.s[0]\n"
+ "fmla z31.s, z17.s, z0.s[0]\n"
+ "ble 55f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z18.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z18.s, z0.s[1]\n"
+ "ld1w { z19.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z20.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z19.s, z0.s[1]\n"
+ "ld1w { z21.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z20.s, z0.s[1]\n"
+ "ld1w { z22.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z23.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z21.s, z0.s[1]\n"
+ "ld1w { z1.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
+ "fmla z28.s, z22.s, z0.s[1]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z29.s, z23.s, z0.s[1]\n"
+ "fmla z30.s, z1.s, z0.s[1]\n"
+ "fmla z31.s, z2.s, z0.s[1]\n"
+ "ble 55f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "subs x21, x21, #0x1\n"
+ "ld1w { z3.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z3.s, z0.s[2]\n"
+ "ld1w { z4.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z4.s, z0.s[2]\n"
+ "ld1w { z6.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z5.s, z0.s[2]\n"
+ "ld1w { z7.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z8.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z6.s, z0.s[2]\n"
+ "ld1w { z9.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
+ "fmla z28.s, z7.s, z0.s[2]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z29.s, z8.s, z0.s[2]\n"
+ "fmla z30.s, z9.s, z0.s[2]\n"
+ "fmla z31.s, z10.s, z0.s[2]\n"
+ "ble 55f\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "ld1w { z11.s }, p2/Z, [%x[B_ptr]]\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "ld1w { z12.s }, p2/Z, [%x[B_ptr], #1, MUL VL]\n"
+ "ld1w { z13.s }, p2/Z, [%x[B_ptr], #2, MUL VL]\n"
+ "fmla z25.s, z12.s, z0.s[3]\n"
+ "ld1w { z14.s }, p2/Z, [%x[B_ptr], #3, MUL VL]\n"
+ "fmla z26.s, z13.s, z0.s[3]\n"
+ "ld1w { z15.s }, p2/Z, [%x[B_ptr], #4, MUL VL]\n"
+ "ld1w { z16.s }, p2/Z, [%x[B_ptr], #5, MUL VL]\n"
+ "fmla z27.s, z14.s, z0.s[3]\n"
+ "ld1w { z17.s }, p2/Z, [%x[B_ptr], #6, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [%x[B_ptr], #7, MUL VL]\n"
+ "fmla z28.s, z15.s, z0.s[3]\n"
+ "addvl %x[B_ptr], %x[B_ptr], #8\n"
+ "fmla z29.s, z16.s, z0.s[3]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x400]\n"
+ "prfm pldl1keep, [%x[B_ptr], #0x440]\n"
+ "fmla z30.s, z17.s, z0.s[3]\n"
+ "fmla z31.s, z18.s, z0.s[3]\n"
+ "55:" // Width 8: Multiply loop: multiply skip
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "prfm pstl1keep, [%x[output_ptr], #0x0]\n"
+ "tbz %x[flags], #1, 56f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmin z26.s, p2/M, z26.s, z16.s\n"
+ "fmin z27.s, p2/M, z27.s, z16.s\n"
+ "fmin z28.s, p2/M, z28.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "fmax z26.s, p2/M, z26.s, z17.s\n"
+ "fmax z27.s, p2/M, z27.s, z17.s\n"
+ "fmax z28.s, p2/M, z28.s, z17.s\n"
+ "fmin z29.s, p2/M, z29.s, z16.s\n"
+ "fmin z30.s, p2/M, z30.s, z16.s\n"
+ "fmin z31.s, p2/M, z31.s, z16.s\n"
+ "fmax z29.s, p2/M, z29.s, z17.s\n"
+ "fmax z30.s, p2/M, z30.s, z17.s\n"
+ "fmax z31.s, p2/M, z31.s, z17.s\n"
+ "56:" // Width 8: No activation
+ "st1w { z24.s }, p2, [%x[output_ptr]]\n"
+ "subs x23, x23, #0x8\n"
+ "st1w { z25.s }, p2, [%x[output_ptr], #1, MUL VL]\n"
+ "sub %x[N], %x[N], x24, LSL #3\n"
+ "st1w { z26.s }, p2, [%x[output_ptr], #2, MUL VL]\n"
+ "st1w { z27.s }, p2, [%x[output_ptr], #3, MUL VL]\n"
+ "st1w { z28.s }, p2, [%x[output_ptr], #4, MUL VL]\n"
+ "st1w { z29.s }, p2, [%x[output_ptr], #5, MUL VL]\n"
+ "st1w { z30.s }, p2, [%x[output_ptr], #6, MUL VL]\n"
+ "st1w { z31.s }, p1, [%x[output_ptr], #7, MUL VL]\n"
+ "addvl %x[output_ptr], %x[output_ptr], #8\n"
+ "bgt 1b\n"
+ "57:" // Exit
+
+ : [B_ptr] "+r" (B_ptr), [N] "+r" (N), [output_ptr] "+r" (output_ptr)
+ : [A_ptr] "r" (A_ptr), [K] "r" (K), [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval))
+ : "cc", "memory", "p0", "p1", "p2", "x19", "x20", "x21", "x22", "x23", "x24", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp
deleted file mode 100644
index 385a16fe10..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4/generic.cpp
+++ /dev/null
@@ -1,2247 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_bf16fp32_dot_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
- const int K_stride = ((K + 1) / 2) * 2;
- const long loops_count = ((K + 8) / 16) - 1;
- K -= loops_count * 16;
- const long regs_count = (K / 8) - 1;
- K -= (regs_count + 1) * 8;
- const long leftovers = K;
- const long blocks_count = (K + 1) / 2;
- float nullbias[256];
- if (!accumulate && !bias) {
- memset(nullbias, 0, (4 * get_vector_length<float>() * sizeof(float)));
- }
- float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
- float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
- const float * const minptr = &minval;
- const float * const maxptr = &maxval;
-
- switch(act.type)
- {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- maxval = static_cast<float>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- minval = 0.0f;
- break;
- }
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const bfloat16 * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(bfloat16);
-
- float *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
- const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
- long loops = loops_count;
- long regs = regs_count;
- long temp = 0;
- long blocks = blocks_count;
- const bfloat16 *a_ptr0 = a_ptr0_base;
- const bfloat16 *b_ptr0 = B + (K_stride * x0);
- const unsigned long ldcb = ldc * sizeof(float);
- const float *biasptr = bias ? bias+x0 : nullbias;
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z16.s, p0/z, [%[biasptr]]\n"
- "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
- ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
- ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
- ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
- ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
- ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
- ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
- ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
- ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
- ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
- ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
- ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
- ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z16.s, p0/z, [%[biasptr]]\n"
- "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "mov z21.d, z17.d\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "mov z22.d, z18.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "mov z23.d, z19.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
- ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
- ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
- ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
- ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
- ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
- ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
- ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
- ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
- ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
- ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
- ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
- ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
- ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
- ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
- ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
- ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
- ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
- ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
- ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
- ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
- ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
- ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
- ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
- ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
- ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
- ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
- ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
- ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
- ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
- ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
- ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
- ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
- ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
- ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
- ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
- ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
- ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
- ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
- ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
- ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
- ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
- ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
- ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
- ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
- ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
- ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
- ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
- ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
- ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
- ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
- ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
- ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
- ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
- ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
- ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
- ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
- ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
- ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
- ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "st1w z20.s, p0, [c_ptr1]\n"
- "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
- "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
- "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z16.s, p0/z, [%[biasptr]]\n"
- "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "mov z21.d, z17.d\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "mov z22.d, z18.d\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "mov z23.d, z19.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "mov z24.d, z16.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z25.d, z17.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z26.d, z18.d\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z27.d, z19.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1w z24.s, p0/z, [c_ptr2]\n"
- "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
- "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
- ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
- ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
- ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
- ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
- ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
- ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
- ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
- ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
- ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
- ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
- ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
- ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
- ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
- ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
- ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
- ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
- ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
- ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
- ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
- ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
- ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
- ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
- ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
- ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
- ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
- ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
- ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
- ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
- ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
- ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
- ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
- ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
- ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
- ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
- ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
- ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
- ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
- ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
- ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
- ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
- ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
- ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
- ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
- ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
- ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
- ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
- ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
- ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
- ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
- ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
- ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
- ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
- ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
- ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
- ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
- ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
- ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
- ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
- ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
- ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
- ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
- ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
- ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
- ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
- ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
- ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
- ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
- ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
- ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
- ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
- ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
- ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
- ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
- ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
- ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
- ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
- ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n"
- ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
- ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
- ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n"
- ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
- ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
- ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n"
- ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
- ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
- ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
- ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
- ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
- ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
- ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
- ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
- ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
- ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
- ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
- ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
- ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
- ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
- ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- "ld1rqh z6.h, p6/z, [a_ptr2]\n"
- ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
- ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
- ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
- ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
- ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
- ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
- ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
- ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
- ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
- ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
- ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
- ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
- ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
- ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
- ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
- ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
- ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
- ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
- ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
- ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
- ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
- ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
- ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
- ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
- ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
- ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
- ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
- ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
- ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
- ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
- ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
- ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
- ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
- ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
- ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
- ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
- ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
- ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
- ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
- ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
- ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
- ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
- ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
- ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
- ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
- ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n"
- ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
- ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
- ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n"
- ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
- ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
- ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n"
- ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
- ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
- ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "st1w z20.s, p0, [c_ptr1]\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
- "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
- "st1w z24.s, p0, [c_ptr2]\n"
- "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
- "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
- "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z16.s, p0/z, [%[biasptr]]\n"
- "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "mov z21.d, z17.d\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "mov z22.d, z18.d\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "mov z23.d, z19.d\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "mov z24.d, z16.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "mov z25.d, z17.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z26.d, z18.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z27.d, z19.d\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z28.d, z16.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z29.d, z17.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z30.d, z18.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "mov z31.d, z19.d\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1w z24.s, p0/z, [c_ptr2]\n"
- "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
- "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
- "ld1w z28.s, p0/z, [c_ptr3]\n"
- "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
- "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
- ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
- ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
- ".inst 0x646b419c // bfdot z28.s, z12.h, z3.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
- ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
- ".inst 0x646b41bd // bfdot z29.s, z13.h, z3.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
- ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
- ".inst 0x646b41de // bfdot z30.s, z14.h, z3.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
- ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
- ".inst 0x646b41ff // bfdot z31.s, z15.h, z3.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
- ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
- ".inst 0x6473411c // bfdot z28.s, z8.h, z3.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
- ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
- ".inst 0x6473413d // bfdot z29.s, z9.h, z3.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
- ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
- ".inst 0x6473415e // bfdot z30.s, z10.h, z3.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
- ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
- ".inst 0x6473417f // bfdot z31.s, z11.h, z3.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
- ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
- ".inst 0x647b419c // bfdot z28.s, z12.h, z3.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
- ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
- ".inst 0x647b41bd // bfdot z29.s, z13.h, z3.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
- ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
- ".inst 0x647b41de // bfdot z30.s, z14.h, z3.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x647b41ff // bfdot z31.s, z15.h, z3.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
- "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
- ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
- ".inst 0x6467411c // bfdot z28.s, z8.h, z7.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
- ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
- ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
- ".inst 0x6467413d // bfdot z29.s, z9.h, z7.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
- ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
- ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
- ".inst 0x6467415e // bfdot z30.s, z10.h, z7.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
- ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
- ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
- ".inst 0x6467417f // bfdot z31.s, z11.h, z7.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
- ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
- ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
- ".inst 0x646f419c // bfdot z28.s, z12.h, z7.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
- ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
- ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
- ".inst 0x646f41bd // bfdot z29.s, z13.h, z7.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
- ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
- ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
- ".inst 0x646f41de // bfdot z30.s, z14.h, z7.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
- ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
- ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
- ".inst 0x646f41ff // bfdot z31.s, z15.h, z7.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
- ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
- ".inst 0x6477411c // bfdot z28.s, z8.h, z7.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
- ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
- ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
- ".inst 0x6477413d // bfdot z29.s, z9.h, z7.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
- ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
- ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
- ".inst 0x6477415e // bfdot z30.s, z10.h, z7.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
- ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
- ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
- ".inst 0x6477417f // bfdot z31.s, z11.h, z7.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
- ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
- ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n"
- ".inst 0x647f419c // bfdot z28.s, z12.h, z7.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
- ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
- ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n"
- ".inst 0x647f41bd // bfdot z29.s, z13.h, z7.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
- ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
- ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n"
- ".inst 0x647f41de // bfdot z30.s, z14.h, z7.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
- ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
- ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n"
- ".inst 0x647f41ff // bfdot z31.s, z15.h, z7.h[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
- ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
- ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
- ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
- ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
- ".inst 0x646b419c // bfdot z28.s, z12.h, z3.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
- ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
- ".inst 0x646b41bd // bfdot z29.s, z13.h, z3.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
- ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
- ".inst 0x646b41de // bfdot z30.s, z14.h, z3.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
- ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
- ".inst 0x646b41ff // bfdot z31.s, z15.h, z3.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
- ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
- ".inst 0x6473411c // bfdot z28.s, z8.h, z3.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
- ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
- ".inst 0x6473413d // bfdot z29.s, z9.h, z3.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
- ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
- ".inst 0x6473415e // bfdot z30.s, z10.h, z3.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
- ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
- ".inst 0x6473417f // bfdot z31.s, z11.h, z3.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
- ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
- ".inst 0x647b419c // bfdot z28.s, z12.h, z3.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
- ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
- ".inst 0x647b41bd // bfdot z29.s, z13.h, z3.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
- ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
- ".inst 0x647b41de // bfdot z30.s, z14.h, z3.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x647b41ff // bfdot z31.s, z15.h, z3.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
- "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
- ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6467411c // bfdot z28.s, z8.h, z7.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
- ".inst 0x6467413d // bfdot z29.s, z9.h, z7.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
- ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
- ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
- ".inst 0x6467415e // bfdot z30.s, z10.h, z7.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
- ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
- ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
- ".inst 0x6467417f // bfdot z31.s, z11.h, z7.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
- ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
- ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
- ".inst 0x646f419c // bfdot z28.s, z12.h, z7.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
- ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
- ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
- ".inst 0x646f41bd // bfdot z29.s, z13.h, z7.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
- ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
- ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
- ".inst 0x646f41de // bfdot z30.s, z14.h, z7.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
- ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
- ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
- ".inst 0x646f41ff // bfdot z31.s, z15.h, z7.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
- ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
- ".inst 0x6477411c // bfdot z28.s, z8.h, z7.h[2]\n"
- ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
- ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
- ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
- ".inst 0x6477413d // bfdot z29.s, z9.h, z7.h[2]\n"
- ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
- ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
- ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
- ".inst 0x6477415e // bfdot z30.s, z10.h, z7.h[2]\n"
- ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
- ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
- ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
- ".inst 0x6477417f // bfdot z31.s, z11.h, z7.h[2]\n"
- ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
- ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
- ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n"
- ".inst 0x647f419c // bfdot z28.s, z12.h, z7.h[3]\n"
- ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
- ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
- ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n"
- ".inst 0x647f41bd // bfdot z29.s, z13.h, z7.h[3]\n"
- ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
- ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
- ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n"
- ".inst 0x647f41de // bfdot z30.s, z14.h, z7.h[3]\n"
- ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
- ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
- ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n"
- ".inst 0x647f41ff // bfdot z31.s, z15.h, z7.h[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
- ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
- ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
- ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
- ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
- ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
- ".inst 0x646b419c // bfdot z28.s, z12.h, z3.h[1]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
- ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
- ".inst 0x646b41bd // bfdot z29.s, z13.h, z3.h[1]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
- ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
- ".inst 0x646b41de // bfdot z30.s, z14.h, z3.h[1]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
- ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
- ".inst 0x646b41ff // bfdot z31.s, z15.h, z3.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
- ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
- ".inst 0x6473411c // bfdot z28.s, z8.h, z3.h[2]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
- ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
- ".inst 0x6473413d // bfdot z29.s, z9.h, z3.h[2]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
- ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
- ".inst 0x6473415e // bfdot z30.s, z10.h, z3.h[2]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
- ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
- ".inst 0x6473417f // bfdot z31.s, z11.h, z3.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
- ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
- ".inst 0x647b419c // bfdot z28.s, z12.h, z3.h[3]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
- ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
- ".inst 0x647b41bd // bfdot z29.s, z13.h, z3.h[3]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
- ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
- ".inst 0x647b41de // bfdot z30.s, z14.h, z3.h[3]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
- ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
- ".inst 0x647b41ff // bfdot z31.s, z15.h, z3.h[3]\n"
- "b 5f\n"
- "4:\n"
- ".inst 0x64604110 // bfdot z16.s, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x64614114 // bfdot z20.s, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x64624118 // bfdot z24.s, z8.h, z2.h[0]\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- ".inst 0x6463411c // bfdot z28.s, z8.h, z3.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64604131 // bfdot z17.s, z9.h, z0.h[0]\n"
- "ld1rqh z6.h, p6/z, [a_ptr2]\n"
- ".inst 0x64614135 // bfdot z21.s, z9.h, z1.h[0]\n"
- "ld1rqh z7.h, p6/z, [a_ptr3]\n"
- ".inst 0x64624139 // bfdot z25.s, z9.h, z2.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6463413d // bfdot z29.s, z9.h, z3.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64604152 // bfdot z18.s, z10.h, z0.h[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x64614156 // bfdot z22.s, z10.h, z1.h[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x6462415a // bfdot z26.s, z10.h, z2.h[0]\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x6463415e // bfdot z30.s, z10.h, z3.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64604173 // bfdot z19.s, z11.h, z0.h[0]\n"
- ".inst 0x64614177 // bfdot z23.s, z11.h, z1.h[0]\n"
- ".inst 0x6462417b // bfdot z27.s, z11.h, z2.h[0]\n"
- ".inst 0x6463417f // bfdot z31.s, z11.h, z3.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64684190 // bfdot z16.s, z12.h, z0.h[1]\n"
- ".inst 0x64694194 // bfdot z20.s, z12.h, z1.h[1]\n"
- ".inst 0x646a4198 // bfdot z24.s, z12.h, z2.h[1]\n"
- ".inst 0x646b419c // bfdot z28.s, z12.h, z3.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x646841b1 // bfdot z17.s, z13.h, z0.h[1]\n"
- ".inst 0x646941b5 // bfdot z21.s, z13.h, z1.h[1]\n"
- ".inst 0x646a41b9 // bfdot z25.s, z13.h, z2.h[1]\n"
- ".inst 0x646b41bd // bfdot z29.s, z13.h, z3.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x646841d2 // bfdot z18.s, z14.h, z0.h[1]\n"
- ".inst 0x646941d6 // bfdot z22.s, z14.h, z1.h[1]\n"
- ".inst 0x646a41da // bfdot z26.s, z14.h, z2.h[1]\n"
- ".inst 0x646b41de // bfdot z30.s, z14.h, z3.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646841f3 // bfdot z19.s, z15.h, z0.h[1]\n"
- ".inst 0x646941f7 // bfdot z23.s, z15.h, z1.h[1]\n"
- ".inst 0x646a41fb // bfdot z27.s, z15.h, z2.h[1]\n"
- ".inst 0x646b41ff // bfdot z31.s, z15.h, z3.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64704110 // bfdot z16.s, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x64714114 // bfdot z20.s, z8.h, z1.h[2]\n"
- ".inst 0x64724118 // bfdot z24.s, z8.h, z2.h[2]\n"
- ".inst 0x6473411c // bfdot z28.s, z8.h, z3.h[2]\n"
- ".inst 0x64704131 // bfdot z17.s, z9.h, z0.h[2]\n"
- ".inst 0x64714135 // bfdot z21.s, z9.h, z1.h[2]\n"
- ".inst 0x64724139 // bfdot z25.s, z9.h, z2.h[2]\n"
- ".inst 0x6473413d // bfdot z29.s, z9.h, z3.h[2]\n"
- ".inst 0x64704152 // bfdot z18.s, z10.h, z0.h[2]\n"
- ".inst 0x64714156 // bfdot z22.s, z10.h, z1.h[2]\n"
- ".inst 0x6472415a // bfdot z26.s, z10.h, z2.h[2]\n"
- ".inst 0x6473415e // bfdot z30.s, z10.h, z3.h[2]\n"
- ".inst 0x64704173 // bfdot z19.s, z11.h, z0.h[2]\n"
- ".inst 0x64714177 // bfdot z23.s, z11.h, z1.h[2]\n"
- ".inst 0x6472417b // bfdot z27.s, z11.h, z2.h[2]\n"
- ".inst 0x6473417f // bfdot z31.s, z11.h, z3.h[2]\n"
- ".inst 0x64784190 // bfdot z16.s, z12.h, z0.h[3]\n"
- ".inst 0x64794194 // bfdot z20.s, z12.h, z1.h[3]\n"
- ".inst 0x647a4198 // bfdot z24.s, z12.h, z2.h[3]\n"
- ".inst 0x647b419c // bfdot z28.s, z12.h, z3.h[3]\n"
- ".inst 0x647841b1 // bfdot z17.s, z13.h, z0.h[3]\n"
- ".inst 0x647941b5 // bfdot z21.s, z13.h, z1.h[3]\n"
- ".inst 0x647a41b9 // bfdot z25.s, z13.h, z2.h[3]\n"
- ".inst 0x647b41bd // bfdot z29.s, z13.h, z3.h[3]\n"
- ".inst 0x647841d2 // bfdot z18.s, z14.h, z0.h[3]\n"
- ".inst 0x647941d6 // bfdot z22.s, z14.h, z1.h[3]\n"
- ".inst 0x647a41da // bfdot z26.s, z14.h, z2.h[3]\n"
- ".inst 0x647b41de // bfdot z30.s, z14.h, z3.h[3]\n"
- ".inst 0x647841f3 // bfdot z19.s, z15.h, z0.h[3]\n"
- ".inst 0x647941f7 // bfdot z23.s, z15.h, z1.h[3]\n"
- ".inst 0x647a41fb // bfdot z27.s, z15.h, z2.h[3]\n"
- ".inst 0x647b41ff // bfdot z31.s, z15.h, z3.h[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64644110 // bfdot z16.s, z8.h, z4.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64654114 // bfdot z20.s, z8.h, z5.h[0]\n"
- ".inst 0x64664118 // bfdot z24.s, z8.h, z6.h[0]\n"
- ".inst 0x6467411c // bfdot z28.s, z8.h, z7.h[0]\n"
- ".inst 0x64644131 // bfdot z17.s, z9.h, z4.h[0]\n"
- ".inst 0x64654135 // bfdot z21.s, z9.h, z5.h[0]\n"
- ".inst 0x64664139 // bfdot z25.s, z9.h, z6.h[0]\n"
- ".inst 0x6467413d // bfdot z29.s, z9.h, z7.h[0]\n"
- ".inst 0x64644152 // bfdot z18.s, z10.h, z4.h[0]\n"
- ".inst 0x64654156 // bfdot z22.s, z10.h, z5.h[0]\n"
- ".inst 0x6466415a // bfdot z26.s, z10.h, z6.h[0]\n"
- ".inst 0x6467415e // bfdot z30.s, z10.h, z7.h[0]\n"
- ".inst 0x64644173 // bfdot z19.s, z11.h, z4.h[0]\n"
- ".inst 0x64654177 // bfdot z23.s, z11.h, z5.h[0]\n"
- ".inst 0x6466417b // bfdot z27.s, z11.h, z6.h[0]\n"
- ".inst 0x6467417f // bfdot z31.s, z11.h, z7.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x646c4190 // bfdot z16.s, z12.h, z4.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x646d4194 // bfdot z20.s, z12.h, z5.h[1]\n"
- ".inst 0x646e4198 // bfdot z24.s, z12.h, z6.h[1]\n"
- ".inst 0x646f419c // bfdot z28.s, z12.h, z7.h[1]\n"
- ".inst 0x646c41b1 // bfdot z17.s, z13.h, z4.h[1]\n"
- ".inst 0x646d41b5 // bfdot z21.s, z13.h, z5.h[1]\n"
- ".inst 0x646e41b9 // bfdot z25.s, z13.h, z6.h[1]\n"
- ".inst 0x646f41bd // bfdot z29.s, z13.h, z7.h[1]\n"
- ".inst 0x646c41d2 // bfdot z18.s, z14.h, z4.h[1]\n"
- ".inst 0x646d41d6 // bfdot z22.s, z14.h, z5.h[1]\n"
- ".inst 0x646e41da // bfdot z26.s, z14.h, z6.h[1]\n"
- ".inst 0x646f41de // bfdot z30.s, z14.h, z7.h[1]\n"
- ".inst 0x646c41f3 // bfdot z19.s, z15.h, z4.h[1]\n"
- ".inst 0x646d41f7 // bfdot z23.s, z15.h, z5.h[1]\n"
- ".inst 0x646e41fb // bfdot z27.s, z15.h, z6.h[1]\n"
- ".inst 0x646f41ff // bfdot z31.s, z15.h, z7.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64744110 // bfdot z16.s, z8.h, z4.h[2]\n"
- ".inst 0x64754114 // bfdot z20.s, z8.h, z5.h[2]\n"
- ".inst 0x64764118 // bfdot z24.s, z8.h, z6.h[2]\n"
- ".inst 0x6477411c // bfdot z28.s, z8.h, z7.h[2]\n"
- ".inst 0x64744131 // bfdot z17.s, z9.h, z4.h[2]\n"
- ".inst 0x64754135 // bfdot z21.s, z9.h, z5.h[2]\n"
- ".inst 0x64764139 // bfdot z25.s, z9.h, z6.h[2]\n"
- ".inst 0x6477413d // bfdot z29.s, z9.h, z7.h[2]\n"
- ".inst 0x64744152 // bfdot z18.s, z10.h, z4.h[2]\n"
- ".inst 0x64754156 // bfdot z22.s, z10.h, z5.h[2]\n"
- ".inst 0x6476415a // bfdot z26.s, z10.h, z6.h[2]\n"
- ".inst 0x6477415e // bfdot z30.s, z10.h, z7.h[2]\n"
- ".inst 0x64744173 // bfdot z19.s, z11.h, z4.h[2]\n"
- ".inst 0x64754177 // bfdot z23.s, z11.h, z5.h[2]\n"
- ".inst 0x6476417b // bfdot z27.s, z11.h, z6.h[2]\n"
- ".inst 0x6477417f // bfdot z31.s, z11.h, z7.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x647c4190 // bfdot z16.s, z12.h, z4.h[3]\n"
- ".inst 0x647d4194 // bfdot z20.s, z12.h, z5.h[3]\n"
- ".inst 0x647e4198 // bfdot z24.s, z12.h, z6.h[3]\n"
- ".inst 0x647f419c // bfdot z28.s, z12.h, z7.h[3]\n"
- ".inst 0x647c41b1 // bfdot z17.s, z13.h, z4.h[3]\n"
- ".inst 0x647d41b5 // bfdot z21.s, z13.h, z5.h[3]\n"
- ".inst 0x647e41b9 // bfdot z25.s, z13.h, z6.h[3]\n"
- ".inst 0x647f41bd // bfdot z29.s, z13.h, z7.h[3]\n"
- ".inst 0x647c41d2 // bfdot z18.s, z14.h, z4.h[3]\n"
- ".inst 0x647d41d6 // bfdot z22.s, z14.h, z5.h[3]\n"
- ".inst 0x647e41da // bfdot z26.s, z14.h, z6.h[3]\n"
- ".inst 0x647f41de // bfdot z30.s, z14.h, z7.h[3]\n"
- ".inst 0x647c41f3 // bfdot z19.s, z15.h, z4.h[3]\n"
- ".inst 0x647d41f7 // bfdot z23.s, z15.h, z5.h[3]\n"
- ".inst 0x647e41fb // bfdot z27.s, z15.h, z6.h[3]\n"
- ".inst 0x647f41ff // bfdot z31.s, z15.h, z7.h[3]\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "st1w z20.s, p0, [c_ptr1]\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "fmax z28.s, p7/m, z28.s, z14.s\n"
- "fmax z29.s, p7/m, z29.s, z14.s\n"
- "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "fmax z30.s, p7/m, z30.s, z14.s\n"
- "fmin z28.s, p7/m, z28.s, z15.s\n"
- "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
- "fmin z29.s, p7/m, z29.s, z15.s\n"
- "fmax z31.s, p7/m, z31.s, z14.s\n"
- "fmin z30.s, p7/m, z30.s, z15.s\n"
- "st1w z24.s, p0, [c_ptr2]\n"
- "fmin z31.s, p7/m, z31.s, z15.s\n"
- "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
- "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
- "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
- "st1w z28.s, p0, [c_ptr3]\n"
- "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
- "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
- "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- }
-
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp
index eba98bb74d..e344d82dc6 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,42 +10,49 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
-
#ifdef __ARM_FEATURE_SVE
-#include "../bfloat.hpp"
#include "../std_transforms_sve.hpp"
+#include "../bfloat.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<bfloat16>, \
+ size_t, size_t, \
+ const bfloat16 *, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
namespace arm_gemm
{
// Actual kernel implementations
-void sve_hybrid_bf16fp32_dot_4VLx4(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool);
+void sve_hybrid_bf16fp32_dot_6x4VL( ARGLIST );
-class hybrid_bf16fp32_dot_4VLx4
+class cls_sve_hybrid_bf16fp32_dot_6x4VL
{
public:
typedef bfloat16 operand_type;
typedef float result_type;
- typedef void (*kern_type)(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
static constexpr unsigned int out_height()
{
- return 4;
+ return 6;
}
static unsigned int out_width()
@@ -63,27 +70,17 @@ public:
return true;
}
- static constexpr bool supports_bias()
- {
- return true;
- }
-
- static constexpr bool supports_activation()
- {
- return true;
- }
-
- StdTransformsSVE<operand_type, result_type, 4, 4, 2> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 6, 4, 2> transforms = {};
// Default to the generic kernel
- kern_type kernel=sve_hybrid_bf16fp32_dot_4VLx4;
+ kern_type kernel=sve_hybrid_bf16fp32_dot_6x4VL;
- hybrid_bf16fp32_dot_4VLx4(const CPUInfo *)
+ cls_sve_hybrid_bf16fp32_dot_6x4VL(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
+#undef ARGLIST
#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
new file mode 100644
index 0000000000..19385e56ea
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_dot_6x4VL/generic.cpp
@@ -0,0 +1,2237 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+#include "../../bfloat.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_bf16fp32_dot_6x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<bfloat16> A_arg,
+ size_t M, size_t N, const bfloat16 *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const bfloat16 *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+ "ptrue p5.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 71f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 57f\n"
+ "beq 43f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 29f\n"
+ "beq 15f\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x16\n"
+ "cbz x14, 4f\n"
+ "ld1w { z8.s }, p5/Z, [x14]\n"
+ "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "b 6f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 5f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "b 6f\n"
+ "5:" // Height 1: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "6:" // Height 1: setup done
+ "mov x12, #0x0\n"
+ "7:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 8f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "cbnz x12, 9f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "b 9f\n"
+ "8:" // Height 1: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "9:" // Height 1: input setup done
+ "cmp x11, #0x8\n"
+ "ble 11f\n"
+ "10:" // Height 1: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "cmp x11, #0x8\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
+ ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ "bgt 10b\n"
+ "11:" // Height 1: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
+ "ble 12f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ "ble 12f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
+ "ble 12f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
+ ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ "12:" // Height 1: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 7b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "tbz %x[flags], #1, 13f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "13:" // Height 1: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "14:" // Height 1: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 3b\n"
+ "b 86f\n"
+ "15:" // Height 2
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 16f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "b 17f\n"
+ "16:" // Height 2: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "17:" // Height 2: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x16\n"
+ "cbz x14, 18f\n"
+ "ld1w { z8.s }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "mov z13.d, z9.d\n"
+ "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "b 20f\n"
+ "18:" // Height 2: no bias
+ "tbz %x[flags], #0, 19f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "b 20f\n"
+ "19:" // Height 2: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "20:" // Height 2: setup done
+ "mov x12, #0x0\n"
+ "21:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 22f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x12, 23f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "b 23f\n"
+ "22:" // Height 2: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "23:" // Height 2: input setup done
+ "cmp x11, #0x8\n"
+ "ble 25f\n"
+ "24:" // Height 2: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "cmp x11, #0x8\n"
+ ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
+ ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
+ ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
+ "bgt 24b\n"
+ "25:" // Height 2: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
+ ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
+ ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
+ "ble 26f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
+ ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
+ "ble 26f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
+ ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
+ "ble 26f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
+ ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
+ "26:" // Height 2: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 21b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "tbz %x[flags], #1, 27f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z1.s\n"
+ "fmax z14.s, p5/M, z14.s, z1.s\n"
+ "fmax z15.s, p5/M, z15.s, z1.s\n"
+ "27:" // Height 2: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "28:" // Height 2: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 17b\n"
+ "b 86f\n"
+ "29:" // Height 3
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 30f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "b 31f\n"
+ "30:" // Height 3: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "31:" // Height 3: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x16\n"
+ "cbz x14, 32f\n"
+ "ld1w { z8.s }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "mov z13.d, z9.d\n"
+ "addvl x14, x14, #4\n"
+ "mov z17.d, z9.d\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "b 34f\n"
+ "32:" // Height 3: no bias
+ "tbz %x[flags], #0, 33f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "b 34f\n"
+ "33:" // Height 3: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "34:" // Height 3: setup done
+ "mov x12, #0x0\n"
+ "35:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 36f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x12, 37f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "b 37f\n"
+ "36:" // Height 3: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "37:" // Height 3: input setup done
+ "cmp x11, #0x8\n"
+ "ble 39f\n"
+ "38:" // Height 3: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "cmp x11, #0x8\n"
+ ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
+ ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
+ ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
+ ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
+ ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
+ "bgt 38b\n"
+ "39:" // Height 3: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
+ ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
+ ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
+ ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
+ ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
+ ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
+ "ble 40f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
+ ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
+ "ble 40f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
+ ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
+ "ble 40f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
+ ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
+ "40:" // Height 3: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 35b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "tbz %x[flags], #1, 41f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z1.s\n"
+ "fmax z14.s, p5/M, z14.s, z1.s\n"
+ "fmax z15.s, p5/M, z15.s, z1.s\n"
+ "fmax z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z1.s\n"
+ "fmax z18.s, p5/M, z18.s, z1.s\n"
+ "fmax z19.s, p5/M, z19.s, z1.s\n"
+ "41:" // Height 3: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "42:" // Height 3: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 31b\n"
+ "b 86f\n"
+ "43:" // Height 4
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 44f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 45f\n"
+ "44:" // Height 4: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "45:" // Height 4: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x16\n"
+ "cbz x14, 46f\n"
+ "ld1w { z8.s }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "mov z20.d, z8.d\n"
+ "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "mov z13.d, z9.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "b 48f\n"
+ "46:" // Height 4: no bias
+ "tbz %x[flags], #0, 47f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25]\n"
+ "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "b 48f\n"
+ "47:" // Height 4: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "48:" // Height 4: setup done
+ "mov x12, #0x0\n"
+ "49:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 50f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x12, 51f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "b 51f\n"
+ "50:" // Height 4: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "51:" // Height 4: input setup done
+ "cmp x11, #0x8\n"
+ "ble 53f\n"
+ "52:" // Height 4: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x8\n"
+ ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
+ ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
+ ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
+ ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
+ ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
+ ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n"
+ ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
+ "bgt 52b\n"
+ "53:" // Height 4: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
+ ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
+ ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
+ ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
+ ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
+ ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
+ ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
+ ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
+ "ble 54f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
+ ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
+ "ble 54f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
+ ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
+ "ble 54f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n"
+ ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
+ "54:" // Height 4: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 49b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbz %x[flags], #1, 55f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z1.s\n"
+ "fmax z14.s, p5/M, z14.s, z1.s\n"
+ "fmax z15.s, p5/M, z15.s, z1.s\n"
+ "fmax z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmin z20.s, p5/M, z20.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z1.s\n"
+ "fmax z18.s, p5/M, z18.s, z1.s\n"
+ "fmax z19.s, p5/M, z19.s, z1.s\n"
+ "fmax z20.s, p5/M, z20.s, z1.s\n"
+ "fmin z21.s, p5/M, z21.s, z0.s\n"
+ "fmin z22.s, p5/M, z22.s, z0.s\n"
+ "fmin z23.s, p5/M, z23.s, z0.s\n"
+ "fmax z21.s, p5/M, z21.s, z1.s\n"
+ "fmax z22.s, p5/M, z22.s, z1.s\n"
+ "fmax z23.s, p5/M, z23.s, z1.s\n"
+ "55:" // Height 4: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1w { z20.s }, p4, [x25]\n"
+ "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "56:" // Height 4: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 45b\n"
+ "b 86f\n"
+ "57:" // Height 5
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 58f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 59f\n"
+ "58:" // Height 5: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "59:" // Height 5: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x16\n"
+ "cbz x14, 60f\n"
+ "ld1w { z8.s }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "mov z20.d, z8.d\n"
+ "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "mov z13.d, z9.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "b 62f\n"
+ "60:" // Height 5: no bias
+ "tbz %x[flags], #0, 61f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25]\n"
+ "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x23]\n"
+ "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "b 62f\n"
+ "61:" // Height 5: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "62:" // Height 5: setup done
+ "mov x12, #0x0\n"
+ "63:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 64f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x12, 65f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "b 65f\n"
+ "64:" // Height 5: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "add x22, x24, x19, LSL #1\n"
+ "65:" // Height 5: input setup done
+ "cmp x11, #0x8\n"
+ "ble 67f\n"
+ "66:" // Height 5: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x8\n"
+ ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
+ ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
+ ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
+ ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
+ ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
+ ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
+ ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
+ ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
+ ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
+ ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
+ ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
+ ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
+ ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
+ ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
+ ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
+ ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
+ ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
+ ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n"
+ ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n"
+ ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
+ ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n"
+ "bgt 66b\n"
+ "67:" // Height 5: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
+ ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
+ ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
+ ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
+ ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
+ ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
+ ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n"
+ ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
+ ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
+ ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
+ ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
+ ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n"
+ "ble 68f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
+ ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
+ ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
+ ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n"
+ ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
+ ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n"
+ "ble 68f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
+ ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
+ ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
+ ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n"
+ ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
+ ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n"
+ "ble 68f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
+ ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
+ ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n"
+ ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n"
+ ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
+ ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n"
+ "68:" // Height 5: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 63b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 69f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z1.s\n"
+ "fmax z14.s, p5/M, z14.s, z1.s\n"
+ "fmax z15.s, p5/M, z15.s, z1.s\n"
+ "fmax z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmin z20.s, p5/M, z20.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z1.s\n"
+ "fmax z18.s, p5/M, z18.s, z1.s\n"
+ "fmax z19.s, p5/M, z19.s, z1.s\n"
+ "fmax z20.s, p5/M, z20.s, z1.s\n"
+ "fmin z21.s, p5/M, z21.s, z0.s\n"
+ "fmin z22.s, p5/M, z22.s, z0.s\n"
+ "fmin z23.s, p5/M, z23.s, z0.s\n"
+ "fmin z24.s, p5/M, z24.s, z0.s\n"
+ "fmax z21.s, p5/M, z21.s, z1.s\n"
+ "fmax z22.s, p5/M, z22.s, z1.s\n"
+ "fmax z23.s, p5/M, z23.s, z1.s\n"
+ "fmax z24.s, p5/M, z24.s, z1.s\n"
+ "fmin z25.s, p5/M, z25.s, z0.s\n"
+ "fmin z26.s, p5/M, z26.s, z0.s\n"
+ "fmin z27.s, p5/M, z27.s, z0.s\n"
+ "fmax z25.s, p5/M, z25.s, z1.s\n"
+ "fmax z26.s, p5/M, z26.s, z1.s\n"
+ "fmax z27.s, p5/M, z27.s, z1.s\n"
+ "69:" // Height 5: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1w { z20.s }, p4, [x25]\n"
+ "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "st1w { z24.s }, p4, [x23]\n"
+ "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "70:" // Height 5: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 59b\n"
+ "b 86f\n"
+ "71:" // Height 6
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 72f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "ldr x21, [%x[output_ptr], #0x28]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 73f\n"
+ "72:" // Height 6: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "add x21, x23, x19, LSL #2\n"
+ "add %x[output_ptr], x21, x19, LSL #2\n"
+ "73:" // Height 6: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x16\n"
+ "cbz x14, 74f\n"
+ "ld1w { z8.s }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "mov z20.d, z8.d\n"
+ "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "mov z13.d, z9.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "mov z28.d, z8.d\n"
+ "mov z29.d, z9.d\n"
+ "mov z30.d, z10.d\n"
+ "mov z31.d, z11.d\n"
+ "b 76f\n"
+ "74:" // Height 6: no bias
+ "tbz %x[flags], #0, 75f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25]\n"
+ "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x23]\n"
+ "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x21]\n"
+ "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "b 76f\n"
+ "75:" // Height 6: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "76:" // Height 6: setup done
+ "mov x12, #0x0\n"
+ "77:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 78f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x12, 79f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "add x20, x20, x19, LSL #1\n"
+ "b 79f\n"
+ "78:" // Height 6: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "add x22, x24, x19, LSL #1\n"
+ "add x20, x22, x19, LSL #1\n"
+ "79:" // Height 6: input setup done
+ "cmp x11, #0x8\n"
+ "ble 81f\n"
+ "80:" // Height 6: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
+ "ld1rqh { z5.h }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x8\n"
+ ".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ ".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
+ ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
+ ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
+ ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n"
+ ".inst 0x646540de // bfdot z30.s, z6.h, z5.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
+ ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
+ ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
+ ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
+ ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n"
+ ".inst 0x646540ff // bfdot z31.s, z7.h, z5.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
+ ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n"
+ ".inst 0x646d40dc // bfdot z28.s, z6.h, z5.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
+ ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n"
+ ".inst 0x646d40fd // bfdot z29.s, z7.h, z5.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
+ ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n"
+ ".inst 0x646d40de // bfdot z30.s, z6.h, z5.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
+ ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n"
+ ".inst 0x646d40ff // bfdot z31.s, z7.h, z5.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
+ ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n"
+ ".inst 0x647540dc // bfdot z28.s, z6.h, z5.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
+ ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n"
+ ".inst 0x647540fd // bfdot z29.s, z7.h, z5.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
+ ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n"
+ ".inst 0x647540de // bfdot z30.s, z6.h, z5.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
+ ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n"
+ ".inst 0x647540ff // bfdot z31.s, z7.h, z5.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
+ ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n"
+ ".inst 0x647d40dc // bfdot z28.s, z6.h, z5.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
+ ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n"
+ ".inst 0x647d40fd // bfdot z29.s, z7.h, z5.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n"
+ ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n"
+ ".inst 0x647d40de // bfdot z30.s, z6.h, z5.h[3]\n"
+ ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
+ ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n"
+ ".inst 0x647d40ff // bfdot z31.s, z7.h, z5.h[3]\n"
+ "bgt 80b\n"
+ "81:" // Height 6: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ ".inst 0x646040c8 // bfdot z8.s, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ ".inst 0x646040e9 // bfdot z9.s, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ ".inst 0x646140cc // bfdot z12.s, z6.h, z1.h[0]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ ".inst 0x646240d0 // bfdot z16.s, z6.h, z2.h[0]\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ ".inst 0x646140ed // bfdot z13.s, z7.h, z1.h[0]\n"
+ "ld1rqh { z5.h }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ ".inst 0x646340d4 // bfdot z20.s, z6.h, z3.h[0]\n"
+ "add x20, x20, #0x10\n"
+ ".inst 0x646240f1 // bfdot z17.s, z7.h, z2.h[0]\n"
+ ".inst 0x646440d8 // bfdot z24.s, z6.h, z4.h[0]\n"
+ ".inst 0x646540dc // bfdot z28.s, z6.h, z5.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646340f5 // bfdot z21.s, z7.h, z3.h[0]\n"
+ ".inst 0x646440f9 // bfdot z25.s, z7.h, z4.h[0]\n"
+ ".inst 0x646540fd // bfdot z29.s, z7.h, z5.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x646040ca // bfdot z10.s, z6.h, z0.h[0]\n"
+ ".inst 0x646140ce // bfdot z14.s, z6.h, z1.h[0]\n"
+ ".inst 0x646240d2 // bfdot z18.s, z6.h, z2.h[0]\n"
+ ".inst 0x646340d6 // bfdot z22.s, z6.h, z3.h[0]\n"
+ ".inst 0x646440da // bfdot z26.s, z6.h, z4.h[0]\n"
+ ".inst 0x646540de // bfdot z30.s, z6.h, z5.h[0]\n"
+ ".inst 0x646040eb // bfdot z11.s, z7.h, z0.h[0]\n"
+ ".inst 0x646140ef // bfdot z15.s, z7.h, z1.h[0]\n"
+ ".inst 0x646240f3 // bfdot z19.s, z7.h, z2.h[0]\n"
+ ".inst 0x646340f7 // bfdot z23.s, z7.h, z3.h[0]\n"
+ ".inst 0x646440fb // bfdot z27.s, z7.h, z4.h[0]\n"
+ ".inst 0x646540ff // bfdot z31.s, z7.h, z5.h[0]\n"
+ "ble 82f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x646840c8 // bfdot z8.s, z6.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ ".inst 0x646940cc // bfdot z12.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d0 // bfdot z16.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d4 // bfdot z20.s, z6.h, z3.h[1]\n"
+ ".inst 0x646c40d8 // bfdot z24.s, z6.h, z4.h[1]\n"
+ ".inst 0x646d40dc // bfdot z28.s, z6.h, z5.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x646840e9 // bfdot z9.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ed // bfdot z13.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f1 // bfdot z17.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f5 // bfdot z21.s, z7.h, z3.h[1]\n"
+ ".inst 0x646c40f9 // bfdot z25.s, z7.h, z4.h[1]\n"
+ ".inst 0x646d40fd // bfdot z29.s, z7.h, z5.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x646840ca // bfdot z10.s, z6.h, z0.h[1]\n"
+ ".inst 0x646940ce // bfdot z14.s, z6.h, z1.h[1]\n"
+ ".inst 0x646a40d2 // bfdot z18.s, z6.h, z2.h[1]\n"
+ ".inst 0x646b40d6 // bfdot z22.s, z6.h, z3.h[1]\n"
+ ".inst 0x646c40da // bfdot z26.s, z6.h, z4.h[1]\n"
+ ".inst 0x646d40de // bfdot z30.s, z6.h, z5.h[1]\n"
+ ".inst 0x646840eb // bfdot z11.s, z7.h, z0.h[1]\n"
+ ".inst 0x646940ef // bfdot z15.s, z7.h, z1.h[1]\n"
+ ".inst 0x646a40f3 // bfdot z19.s, z7.h, z2.h[1]\n"
+ ".inst 0x646b40f7 // bfdot z23.s, z7.h, z3.h[1]\n"
+ ".inst 0x646c40fb // bfdot z27.s, z7.h, z4.h[1]\n"
+ ".inst 0x646d40ff // bfdot z31.s, z7.h, z5.h[1]\n"
+ "ble 82f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x647040c8 // bfdot z8.s, z6.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x2\n"
+ ".inst 0x647140cc // bfdot z12.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d0 // bfdot z16.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d4 // bfdot z20.s, z6.h, z3.h[2]\n"
+ ".inst 0x647440d8 // bfdot z24.s, z6.h, z4.h[2]\n"
+ ".inst 0x647540dc // bfdot z28.s, z6.h, z5.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x647040e9 // bfdot z9.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ed // bfdot z13.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f1 // bfdot z17.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f5 // bfdot z21.s, z7.h, z3.h[2]\n"
+ ".inst 0x647440f9 // bfdot z25.s, z7.h, z4.h[2]\n"
+ ".inst 0x647540fd // bfdot z29.s, z7.h, z5.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x647040ca // bfdot z10.s, z6.h, z0.h[2]\n"
+ ".inst 0x647140ce // bfdot z14.s, z6.h, z1.h[2]\n"
+ ".inst 0x647240d2 // bfdot z18.s, z6.h, z2.h[2]\n"
+ ".inst 0x647340d6 // bfdot z22.s, z6.h, z3.h[2]\n"
+ ".inst 0x647440da // bfdot z26.s, z6.h, z4.h[2]\n"
+ ".inst 0x647540de // bfdot z30.s, z6.h, z5.h[2]\n"
+ ".inst 0x647040eb // bfdot z11.s, z7.h, z0.h[2]\n"
+ ".inst 0x647140ef // bfdot z15.s, z7.h, z1.h[2]\n"
+ ".inst 0x647240f3 // bfdot z19.s, z7.h, z2.h[2]\n"
+ ".inst 0x647340f7 // bfdot z23.s, z7.h, z3.h[2]\n"
+ ".inst 0x647440fb // bfdot z27.s, z7.h, z4.h[2]\n"
+ ".inst 0x647540ff // bfdot z31.s, z7.h, z5.h[2]\n"
+ "ble 82f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ ".inst 0x647840c8 // bfdot z8.s, z6.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ ".inst 0x647940cc // bfdot z12.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d0 // bfdot z16.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d4 // bfdot z20.s, z6.h, z3.h[3]\n"
+ ".inst 0x647c40d8 // bfdot z24.s, z6.h, z4.h[3]\n"
+ ".inst 0x647d40dc // bfdot z28.s, z6.h, z5.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ ".inst 0x647840e9 // bfdot z9.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ed // bfdot z13.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f1 // bfdot z17.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f5 // bfdot z21.s, z7.h, z3.h[3]\n"
+ ".inst 0x647c40f9 // bfdot z25.s, z7.h, z4.h[3]\n"
+ ".inst 0x647d40fd // bfdot z29.s, z7.h, z5.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ ".inst 0x647840ca // bfdot z10.s, z6.h, z0.h[3]\n"
+ ".inst 0x647940ce // bfdot z14.s, z6.h, z1.h[3]\n"
+ ".inst 0x647a40d2 // bfdot z18.s, z6.h, z2.h[3]\n"
+ ".inst 0x647b40d6 // bfdot z22.s, z6.h, z3.h[3]\n"
+ ".inst 0x647c40da // bfdot z26.s, z6.h, z4.h[3]\n"
+ ".inst 0x647d40de // bfdot z30.s, z6.h, z5.h[3]\n"
+ ".inst 0x647840eb // bfdot z11.s, z7.h, z0.h[3]\n"
+ ".inst 0x647940ef // bfdot z15.s, z7.h, z1.h[3]\n"
+ ".inst 0x647a40f3 // bfdot z19.s, z7.h, z2.h[3]\n"
+ ".inst 0x647b40f7 // bfdot z23.s, z7.h, z3.h[3]\n"
+ ".inst 0x647c40fb // bfdot z27.s, z7.h, z4.h[3]\n"
+ ".inst 0x647d40ff // bfdot z31.s, z7.h, z5.h[3]\n"
+ "82:" // Height 6: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 77b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 83f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z1.s\n"
+ "fmax z14.s, p5/M, z14.s, z1.s\n"
+ "fmax z15.s, p5/M, z15.s, z1.s\n"
+ "fmax z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmin z20.s, p5/M, z20.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z1.s\n"
+ "fmax z18.s, p5/M, z18.s, z1.s\n"
+ "fmax z19.s, p5/M, z19.s, z1.s\n"
+ "fmax z20.s, p5/M, z20.s, z1.s\n"
+ "fmin z21.s, p5/M, z21.s, z0.s\n"
+ "fmin z22.s, p5/M, z22.s, z0.s\n"
+ "fmin z23.s, p5/M, z23.s, z0.s\n"
+ "fmin z24.s, p5/M, z24.s, z0.s\n"
+ "fmax z21.s, p5/M, z21.s, z1.s\n"
+ "fmax z22.s, p5/M, z22.s, z1.s\n"
+ "fmax z23.s, p5/M, z23.s, z1.s\n"
+ "fmax z24.s, p5/M, z24.s, z1.s\n"
+ "fmin z25.s, p5/M, z25.s, z0.s\n"
+ "fmin z26.s, p5/M, z26.s, z0.s\n"
+ "fmin z27.s, p5/M, z27.s, z0.s\n"
+ "fmin z28.s, p5/M, z28.s, z0.s\n"
+ "fmax z25.s, p5/M, z25.s, z1.s\n"
+ "fmax z26.s, p5/M, z26.s, z1.s\n"
+ "fmax z27.s, p5/M, z27.s, z1.s\n"
+ "fmax z28.s, p5/M, z28.s, z1.s\n"
+ "fmin z29.s, p5/M, z29.s, z0.s\n"
+ "fmin z30.s, p5/M, z30.s, z0.s\n"
+ "fmin z31.s, p5/M, z31.s, z0.s\n"
+ "fmax z29.s, p5/M, z29.s, z1.s\n"
+ "fmax z30.s, p5/M, z30.s, z1.s\n"
+ "fmax z31.s, p5/M, z31.s, z1.s\n"
+ "83:" // Height 6: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1w { z20.s }, p4, [x25]\n"
+ "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "st1w { z24.s }, p4, [x23]\n"
+ "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "st1w { z28.s }, p4, [x21]\n"
+ "st1w { z29.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z30.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z31.s }, p1, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #4\n"
+ "84:" // Height 6: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 73b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 86f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 85f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "85:" // Update direct input
+ "mov x19, #0xc\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "86:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp
deleted file mode 100644
index 641e5c12fd..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4.hpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-#include "../bfloat.hpp"
-#include "../std_transforms_sve.hpp"
-
-namespace arm_gemm
-{
-
-// Actual kernel implementations
-void sve_hybrid_bf16fp32_mmla_4VLx4(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool);
-
-class hybrid_bf16fp32_mmla_4VLx4
-{
-public:
- typedef bfloat16 operand_type;
- typedef float result_type;
-
- typedef void (*kern_type)(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool);
-
- /* Kernel blocking parameters */
- static constexpr unsigned int out_height()
- {
- return 8;
- }
-
- static unsigned int out_width()
- {
- return get_vector_length<float>() * 2;
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 4;
- }
-
- static constexpr bool supports_accumulate()
- {
- return true;
- }
-
- static constexpr bool supports_bias()
- {
- return true;
- }
-
- static constexpr bool supports_activation()
- {
- return true;
- }
-
- StdTransformsSVE<operand_type, result_type, 4, 4, 4> transforms = {};
-
- // Default to the generic kernel
- kern_type kernel=sve_hybrid_bf16fp32_mmla_4VLx4;
-
- hybrid_bf16fp32_mmla_4VLx4(const CPUInfo *)
- {
-
- }
-};
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp
deleted file mode 100644
index 76e3546c6f..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_4VLx4/generic.cpp
+++ /dev/null
@@ -1,3459 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_bf16fp32_mmla_4VLx4(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
- const int K_stride = ((K + 3) / 4) * 4;
- const long loops_count = ((K + 8) / 16) - 1;
- K -= loops_count * 16;
- const long regs_count = (K / 8) - 1;
- K -= (regs_count + 1) * 8;
- const long leftovers = K;
- const long blocks_count = (K + 3) / 4;
- float nullbias[128];
- if (!accumulate && !bias) {
- memset(nullbias, 0, (2 * get_vector_length<float>() * sizeof(float)));
- }
- float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
- float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
- const float * const minptr = &minval;
- const float * const maxptr = &maxval;
-
- switch(act.type)
- {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- maxval = static_cast<float>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- minval = 0.0f;
- break;
- }
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const bfloat16 * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(bfloat16);
-
- float *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 8) {
- if (rows_to_compute % 8) {
- rows_to_compute = 8 - 1;
- } else {
- rows_to_compute = 8;
- }
- }
-
- for (int x0=0; x0<N; x0+=(2 * get_vector_length<float>())) {
- const long width = std::min((unsigned long)N-x0, (2 * get_vector_length<float>()));
- long loops = loops_count;
- long regs = regs_count;
- long temp = 0;
- long blocks = blocks_count;
- const bfloat16 *a_ptr0 = a_ptr0_base;
- const bfloat16 *b_ptr0 = B + (K_stride * x0);
- const unsigned long ldcb = ldc * sizeof(float);
- const float *biasptr = bias ? bias+x0 : nullbias;
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z1.h, #0\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "mov z14.s, #0\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "mov z1.h, #0\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "mov z14.s, #0\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z5.h, #0\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z0.d, z4.d, z5.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "mov z1.h, #0\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z5.h, #0\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z0.d, z4.d, z5.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "mov z1.h, #0\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z5.h, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp1 z1.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "st1w z1.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "trn1 z0.d, z4.d, z5.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "subs %[loops], %[loops], #0x1\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "trn1 z0.d, z4.d, z5.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "trn1 z0.d, z4.d, z5.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z3.h, #0\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "mov z20.d, z16.d\n"
- "mov z21.d, z17.d\n"
- "mov z22.d, z18.d\n"
- "mov z23.d, z19.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "mov z3.h, #0\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "mov z14.s, #0\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "mov z14.s, #0\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z7.h, #0\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "trn2 z9.d, z6.d, z7.d\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z1.d, z6.d, z7.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "mov z3.h, #0\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "trn2 z8.d, z4.d, z5.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z7.h, #0\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "trn2 z9.d, z6.d, z7.d\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z1.d, z6.d, z7.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "mov z3.h, #0\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z6.h, p6/z, [a_ptr2]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z7.h, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp1 z5.s, z22.s, z23.s\n"
- "st1w z4.s, p0, [c_ptr2]\n"
- "st1w z5.s, p1, [c_ptr2, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- case 4: // Variant handling four rows of A/C at once (a_ptr0..a_ptr3 -> c_ptr0..c_ptr3); the switch selector is outside this view.
- __asm __volatile ( // SVE inline assembly; bfmmla is emitted as raw .inst words with the mnemonic kept in the assembler comment.
- "a_ptr1 .req X0\n" // Alias scratch registers x0-x5 as the extra row pointers; all six appear in the clobber list.
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n" // Rows 1-3 of A and C are derived by repeatedly adding the lda/ldc operands (bound to ldab/ldcb below).
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n" // p6: halfword lanes where temp + lane < leftovers; masks the final partial A loads.
- "whilelt p0.s, %[temp], %[width]\n" // p0: 32-bit lanes where temp + lane < width; masks the first vector of output columns.
- "incw %[temp], all, mul #1\n" // Advance temp by one vector's worth of 32-bit lanes.
- "ptrue p7.h\n" // p7: all-true predicate for full loads and the clamp.
- "whilelt p1.s, %[temp], %[width]\n" // p1: same test as p0, for the second vector of output columns.
- "cbnz %[accumulate], 1f\n" // Non-zero accumulate: skip the bias init and seed from existing C at label 1.
- "ld1w z15.s, p0/z, [%[biasptr]]\n" // Bias path: bias for the first column vector.
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n" // Load 16 bytes of each A row, replicated to every 128-bit segment.
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n" // Duplicate each bias lane into adjacent pairs so both rows of a pair start from the same bias.
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n" // Bias for the second column vector.
- "trn1 z8.d, z0.d, z1.d\n" // z8: per 128-bit segment, low 64 bits of row 0 followed by low 64 bits of row 1.
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n" // z12-z15 hold the current group of four B vectors.
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z9.d, z2.d, z3.d\n" // z9: same pairing for rows 2 and 3.
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z20.d, z16.d\n" // Accumulators for rows 2/3 (z20-z23) start from the same bias as rows 0/1 (z16-z19).
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z21.d, z17.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n" // Step B past the four vectors just loaded.
- "mov z22.d, z18.d\n"
- "mov z23.d, z19.d\n"
- "cbz %[loops], 2f\n" // No full iterations requested: go straight to the tail.
- "b 3f\n" // Otherwise enter the main loop.
- "1:\n" // Label 1: accumulate path - seed the accumulators from the existing C contents.
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n" // Interleave rows 0 and 1 of C lane by lane (undone by uzp1/uzp2 before the stores).
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z14.s, p0/z, [c_ptr3]\n"
- "zip1 z20.s, z13.s, z14.s\n" // Same interleave for rows 2 and 3.
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n" // z13/z14 were used as C staging above, so their B vectors are loaded last.
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n" // Label 3: main loop; each pass advances every A row pointer by 0x20 bytes and b_ptr0 by 16 vectors.
- "trn2 z0.d, z0.d, z1.d\n" // z0/z1: high 64-bit halves of the previously loaded rows (pairs 0/1 and 2/3).
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "subs %[loops], %[loops], #0x1\n" // Count down; no later instruction in the loop writes flags, so b.ne at the end sees this result.
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n" // Second 16-byte block of this pass, addressed relative to the already-advanced pointer.
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z1.d, z6.d, z7.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n" // Advance B by 16 vectors; the remaining B loads of this pass use negative offsets.
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n" // Preload the B group consumed at the top of the next pass (or by the tail).
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "b.ne 3b\n" // Repeat while the subs above left loops non-zero.
- "2:\n" // Label 2: tail after the main loop.
- "cbz %[regs], 4f\n" // regs == 0: only the partial block remains; handled at label 4.
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n" // regs != 0: one more full 16-byte block per row (p7), then the partial block (p6) at +0x10.
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "addvl a_ptr2, a_ptr2, #2\n" // NOTE(review): bumped in vector-length units here, unlike the 0x20-byte steps in the main loop; a_ptr2 is not read again on this path.
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z1.d, z6.d, z7.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n" // Together with the #-4 adjustment below, this section advances B by 12 vectors net.
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "cbz %[blocks], 5f\n" // blocks == 0: nothing left from the partial block; go to clamp/store.
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n" // The first trailing group always runs here; the second is skipped by b.eq when blocks was 1.
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "b 5f\n" // Skip the regs == 0 tail.
- "4:\n" // Label 4: regs == 0 tail - the partial A block is loaded directly from each row pointer under p6.
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z6.h, p6/z, [a_ptr2]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1rqh z7.h, p6/z, [a_ptr3]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- "cbz %[blocks], 5f\n" // Same optional one-or-two trailing groups as in the regs != 0 tail.
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "5:\n" // Label 5: clamp the accumulators and store the four output rows.
- "ld1rw z14.s, p7/z, [%[minptr]]\n" // Broadcast the lower bound read from minptr.
- "ld1rw z15.s, p7/z, [%[maxptr]]\n" // Broadcast the upper bound read from maxptr.
- "fmax z16.s, p7/m, z16.s, z14.s\n" // Clamp: max with the lower bound, then min with the upper bound.
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n" // De-interleave: even lanes form row 0, odd lanes (uzp2) form row 1.
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n" // Stores are predicated by p0/p1, so lanes at or beyond width are not written.
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n" // c_ptr0 is an in/out operand; it ends up two vectors further along.
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n" // Rows 2 and 3 are de-interleaved and stored the same way.
- "uzp2 z5.s, z20.s, z21.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "st1w z4.s, p0, [c_ptr2]\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "st1w z5.s, p0, [c_ptr3]\n"
- "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
- "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
- ".unreq a_ptr1\n" // Release the register aliases defined at the top of this asm block.
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks) // Read-write operands: pointers and counters modified by the assembly.
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers) // Read-only operands; lda/ldc are bound to ldab/ldcb (presumably byte strides - confirm at the definition).
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory" // Clobbers: z0-z31, the six aliased x registers, condition flags and memory.
- );
- break;
- case 5:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "c_ptr1 .req X4\n"
- "c_ptr2 .req X5\n"
- "c_ptr3 .req X6\n"
- "c_ptr4 .req X7\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z5.h, #0\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "ld1rqh z4.h, p7/z, [a_ptr4]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z21.d, z17.d\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "mov z22.d, z18.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "mov z23.d, z19.d\n"
- "mov z24.d, z16.d\n"
- "mov z25.d, z17.d\n"
- "mov z26.d, z18.d\n"
- "mov z27.d, z19.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "mov z5.h, #0\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqh z4.h, p7/z, [a_ptr4]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z14.s, p0/z, [c_ptr3]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr4]\n"
- "mov z14.s, #0\n"
- "zip1 z24.s, z13.s, z14.s\n"
- "zip2 z25.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
- "mov z14.s, #0\n"
- "zip1 z26.s, z13.s, z14.s\n"
- "zip2 z27.s, z13.s, z14.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z8.h, p7/z, [a_ptr4]\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z9.h, #0\n"
- "add a_ptr4, a_ptr4, #0x20\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "trn2 z10.d, z8.d, z9.d\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z2.d, z8.d, z9.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p7/z, [a_ptr4, #-0x10]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "mov z5.h, #0\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z8.h, p7/z, [a_ptr4]\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z9.h, #0\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "trn2 z10.d, z8.d, z9.d\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z2.d, z8.d, z9.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p6/z, [a_ptr4, #0x10]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "addvl a_ptr4, a_ptr4, #2\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "mov z5.h, #0\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z6.h, p6/z, [a_ptr2]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z7.h, p6/z, [a_ptr3]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z8.h, p6/z, [a_ptr4]\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "addvl a_ptr4, a_ptr4, #1\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z9.h, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "trn1 z2.d, z8.d, z9.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp2 z5.s, z20.s, z21.s\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "st1w z4.s, p0, [c_ptr2]\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "st1w z5.s, p0, [c_ptr3]\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "uzp1 z8.s, z24.s, z25.s\n"
- "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
- "uzp1 z9.s, z26.s, z27.s\n"
- "st1w z8.s, p0, [c_ptr4]\n"
- "st1w z9.s, p1, [c_ptr4, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
- );
- break;
- case 6:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "c_ptr1 .req X5\n"
- "c_ptr2 .req X6\n"
- "c_ptr3 .req X7\n"
- "c_ptr4 .req X8\n"
- "c_ptr5 .req X9\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "ld1rqh z4.h, p7/z, [a_ptr4]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1rqh z5.h, p7/z, [a_ptr5]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z21.d, z17.d\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z22.d, z18.d\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "mov z23.d, z19.d\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "mov z24.d, z16.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "mov z25.d, z17.d\n"
- "mov z26.d, z18.d\n"
- "mov z27.d, z19.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqh z4.h, p7/z, [a_ptr4]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1rqh z5.h, p7/z, [a_ptr5]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z14.s, p0/z, [c_ptr3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr4]\n"
- "ld1w z14.s, p0/z, [c_ptr5]\n"
- "zip1 z24.s, z13.s, z14.s\n"
- "zip2 z25.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n"
- "zip1 z26.s, z13.s, z14.s\n"
- "zip2 z27.s, z13.s, z14.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z8.h, p7/z, [a_ptr4]\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1rqh z9.h, p7/z, [a_ptr5]\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "add a_ptr4, a_ptr4, #0x20\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "add a_ptr5, a_ptr5, #0x20\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z2.d, z8.d, z9.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p7/z, [a_ptr4, #-0x10]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr5, #-0x10]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z8.h, p7/z, [a_ptr4]\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1rqh z9.h, p7/z, [a_ptr5]\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn2 z10.d, z8.d, z9.d\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z2.d, z8.d, z9.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p6/z, [a_ptr4, #0x10]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z5.h, p6/z, [a_ptr5, #0x10]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "addvl a_ptr4, a_ptr4, #2\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "addvl a_ptr5, a_ptr5, #2\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z6.h, p6/z, [a_ptr2]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z7.h, p6/z, [a_ptr3]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z8.h, p6/z, [a_ptr4]\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "addvl a_ptr4, a_ptr4, #1\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1rqh z9.h, p6/z, [a_ptr5]\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "addvl a_ptr5, a_ptr5, #1\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "trn1 z2.d, z8.d, z9.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp2 z5.s, z20.s, z21.s\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "st1w z4.s, p0, [c_ptr2]\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "st1w z5.s, p0, [c_ptr3]\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "uzp1 z8.s, z24.s, z25.s\n"
- "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
- "uzp2 z9.s, z24.s, z25.s\n"
- "uzp1 z10.s, z26.s, z27.s\n"
- "uzp2 z11.s, z26.s, z27.s\n"
- "st1w z8.s, p0, [c_ptr4]\n"
- "st1w z9.s, p0, [c_ptr5]\n"
- "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n"
- "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
- );
- break;
- case 7:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "c_ptr1 .req X6\n"
- "c_ptr2 .req X7\n"
- "c_ptr3 .req X8\n"
- "c_ptr4 .req X9\n"
- "c_ptr5 .req X10\n"
- "c_ptr6 .req X11\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z7.h, #0\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "ld1rqh z4.h, p7/z, [a_ptr4]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1rqh z5.h, p7/z, [a_ptr5]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1rqh z6.h, p7/z, [a_ptr6]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "mov z20.d, z16.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z21.d, z17.d\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z22.d, z18.d\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "mov z23.d, z19.d\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "mov z24.d, z16.d\n"
- "add a_ptr6, a_ptr6, #0x10\n"
- "mov z25.d, z17.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "mov z26.d, z18.d\n"
- "mov z27.d, z19.d\n"
- "mov z28.d, z16.d\n"
- "mov z29.d, z17.d\n"
- "mov z30.d, z18.d\n"
- "mov z31.d, z19.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "mov z7.h, #0\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqh z4.h, p7/z, [a_ptr4]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1rqh z5.h, p7/z, [a_ptr5]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z14.s, p0/z, [c_ptr3]\n"
- "ld1rqh z6.h, p7/z, [a_ptr6]\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "add a_ptr6, a_ptr6, #0x10\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr4]\n"
- "ld1w z14.s, p0/z, [c_ptr5]\n"
- "zip1 z24.s, z13.s, z14.s\n"
- "zip2 z25.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n"
- "zip1 z26.s, z13.s, z14.s\n"
- "zip2 z27.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr6]\n"
- "mov z14.s, #0\n"
- "zip1 z28.s, z13.s, z14.s\n"
- "zip2 z29.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n"
- "mov z14.s, #0\n"
- "zip1 z30.s, z13.s, z14.s\n"
- "zip2 z31.s, z13.s, z14.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "subs %[loops], %[loops], #0x1\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z8.h, p7/z, [a_ptr4]\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1rqh z9.h, p7/z, [a_ptr5]\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- "add a_ptr4, a_ptr4, #0x20\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- "add a_ptr5, a_ptr5, #0x20\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1rqh z10.h, p7/z, [a_ptr6]\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z11.h, #0\n"
- "add a_ptr6, a_ptr6, #0x20\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "trn1 z2.d, z8.d, z9.d\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z3.d, z10.d, z11.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z11.d, z10.d, z11.d\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1rqh z6.h, p7/z, [a_ptr6, #-0x10]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p7/z, [a_ptr4, #-0x10]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr5, #-0x10]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "mov z7.h, #0\n"
- "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "trn1 z10.d, z4.d, z5.d\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z8.h, p7/z, [a_ptr4]\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1rqh z9.h, p7/z, [a_ptr5]\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1rqh z10.h, p7/z, [a_ptr6]\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z11.h, #0\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "trn1 z2.d, z8.d, z9.d\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z3.d, z10.d, z11.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z11.d, z10.d, z11.d\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1rqh z6.h, p6/z, [a_ptr6, #0x10]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p6/z, [a_ptr4, #0x10]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z5.h, p6/z, [a_ptr5, #0x10]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "addvl a_ptr4, a_ptr4, #2\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "addvl a_ptr5, a_ptr5, #2\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- "addvl a_ptr6, a_ptr6, #2\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "mov z7.h, #0\n"
- "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "trn1 z10.d, z4.d, z5.d\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z6.h, p6/z, [a_ptr2]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1rqh z7.h, p6/z, [a_ptr3]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z8.h, p6/z, [a_ptr4]\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1rqh z9.h, p6/z, [a_ptr5]\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- "addvl a_ptr4, a_ptr4, #1\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- "addvl a_ptr5, a_ptr5, #1\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1rqh z10.h, p6/z, [a_ptr6]\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z11.h, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "addvl a_ptr6, a_ptr6, #1\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "trn1 z2.d, z8.d, z9.d\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "trn1 z3.d, z10.d, z11.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z11.d, z10.d, z11.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp2 z5.s, z20.s, z21.s\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "st1w z4.s, p0, [c_ptr2]\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "st1w z5.s, p0, [c_ptr3]\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "fmax z28.s, p7/m, z28.s, z14.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "fmin z28.s, p7/m, z28.s, z15.s\n"
- "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
- "uzp1 z8.s, z24.s, z25.s\n"
- "uzp2 z9.s, z24.s, z25.s\n"
- "uzp1 z10.s, z26.s, z27.s\n"
- "uzp2 z11.s, z26.s, z27.s\n"
- "st1w z8.s, p0, [c_ptr4]\n"
- "fmax z29.s, p7/m, z29.s, z14.s\n"
- "fmax z30.s, p7/m, z30.s, z14.s\n"
- "fmax z31.s, p7/m, z31.s, z14.s\n"
- "st1w z9.s, p0, [c_ptr5]\n"
- "fmin z29.s, p7/m, z29.s, z15.s\n"
- "fmin z30.s, p7/m, z30.s, z15.s\n"
- "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n"
- "fmin z31.s, p7/m, z31.s, z15.s\n"
- "uzp1 z12.s, z28.s, z29.s\n"
- "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n"
- "uzp1 z13.s, z30.s, z31.s\n"
- "st1w z12.s, p0, [c_ptr6]\n"
- "st1w z13.s, p1, [c_ptr6, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "cc", "memory"
- );
- break;
- default:
- case 8:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "ld1rqh z4.h, p7/z, [a_ptr4]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1rqh z5.h, p7/z, [a_ptr5]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1rqh z6.h, p7/z, [a_ptr6]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1rqh z7.h, p7/z, [a_ptr7]\n"
- "mov z20.d, z16.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z21.d, z17.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z22.d, z18.d\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z23.d, z19.d\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "mov z24.d, z16.d\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "mov z25.d, z17.d\n"
- "add a_ptr6, a_ptr6, #0x10\n"
- "mov z26.d, z18.d\n"
- "add a_ptr7, a_ptr7, #0x10\n"
- "mov z27.d, z19.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "mov z28.d, z16.d\n"
- "mov z29.d, z17.d\n"
- "mov z30.d, z18.d\n"
- "mov z31.d, z19.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqh z4.h, p7/z, [a_ptr4]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1rqh z5.h, p7/z, [a_ptr5]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z14.s, p0/z, [c_ptr3]\n"
- "ld1rqh z6.h, p7/z, [a_ptr6]\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1rqh z7.h, p7/z, [a_ptr7]\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "add a_ptr6, a_ptr6, #0x10\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "add a_ptr7, a_ptr7, #0x10\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr4]\n"
- "ld1w z14.s, p0/z, [c_ptr5]\n"
- "zip1 z24.s, z13.s, z14.s\n"
- "zip2 z25.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n"
- "zip1 z26.s, z13.s, z14.s\n"
- "zip2 z27.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr6]\n"
- "ld1w z14.s, p0/z, [c_ptr7]\n"
- "zip1 z28.s, z13.s, z14.s\n"
- "zip2 z29.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr7, #1, MUL VL]\n"
- "zip1 z30.s, z13.s, z14.s\n"
- "zip2 z31.s, z13.s, z14.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "subs %[loops], %[loops], #0x1\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z8.h, p7/z, [a_ptr4]\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1rqh z9.h, p7/z, [a_ptr5]\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- "add a_ptr4, a_ptr4, #0x20\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- "add a_ptr5, a_ptr5, #0x20\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1rqh z10.h, p7/z, [a_ptr6]\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z11.h, p7/z, [a_ptr7]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "add a_ptr6, a_ptr6, #0x20\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "add a_ptr7, a_ptr7, #0x20\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "trn1 z2.d, z8.d, z9.d\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z3.d, z10.d, z11.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z11.d, z10.d, z11.d\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1rqh z6.h, p7/z, [a_ptr6, #-0x10]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p7/z, [a_ptr4, #-0x10]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr5, #-0x10]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "ld1rqh z7.h, p7/z, [a_ptr7, #-0x10]\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "trn1 z10.d, z4.d, z5.d\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z8.h, p7/z, [a_ptr4]\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1rqh z9.h, p7/z, [a_ptr5]\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1rqh z10.h, p7/z, [a_ptr6]\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z11.h, p7/z, [a_ptr7]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "trn1 z2.d, z8.d, z9.d\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z3.d, z10.d, z11.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z11.d, z10.d, z11.d\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1rqh z6.h, p6/z, [a_ptr6, #0x10]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p6/z, [a_ptr4, #0x10]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z5.h, p6/z, [a_ptr5, #0x10]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "ld1rqh z7.h, p6/z, [a_ptr7, #0x10]\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "addvl a_ptr4, a_ptr4, #2\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- "addvl a_ptr5, a_ptr5, #2\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- "addvl a_ptr6, a_ptr6, #2\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- "addvl a_ptr7, a_ptr7, #2\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "trn1 z10.d, z4.d, z5.d\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- "ld1rqh z6.h, p6/z, [a_ptr2]\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- "ld1rqh z7.h, p6/z, [a_ptr3]\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- "ld1rqh z8.h, p6/z, [a_ptr4]\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- "ld1rqh z9.h, p6/z, [a_ptr5]\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- "addvl a_ptr4, a_ptr4, #1\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- "addvl a_ptr5, a_ptr5, #1\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- "ld1rqh z10.h, p6/z, [a_ptr6]\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "ld1rqh z11.h, p6/z, [a_ptr7]\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- "addvl a_ptr6, a_ptr6, #1\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- "addvl a_ptr7, a_ptr7, #1\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- "trn1 z2.d, z8.d, z9.d\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "trn1 z3.d, z10.d, z11.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z11.d, z10.d, z11.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e590 // bfmmla z16.s, z12.h, z0.h\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6460e5b1 // bfmmla z17.s, z13.h, z0.h\n"
- ".inst 0x6460e5d2 // bfmmla z18.s, z14.h, z0.h\n"
- ".inst 0x6460e5f3 // bfmmla z19.s, z15.h, z0.h\n"
- ".inst 0x6461e594 // bfmmla z20.s, z12.h, z1.h\n"
- ".inst 0x6461e5b5 // bfmmla z21.s, z13.h, z1.h\n"
- ".inst 0x6461e5d6 // bfmmla z22.s, z14.h, z1.h\n"
- ".inst 0x6461e5f7 // bfmmla z23.s, z15.h, z1.h\n"
- ".inst 0x6462e598 // bfmmla z24.s, z12.h, z2.h\n"
- ".inst 0x6462e5b9 // bfmmla z25.s, z13.h, z2.h\n"
- ".inst 0x6462e5da // bfmmla z26.s, z14.h, z2.h\n"
- ".inst 0x6462e5fb // bfmmla z27.s, z15.h, z2.h\n"
- ".inst 0x6463e59c // bfmmla z28.s, z12.h, z3.h\n"
- ".inst 0x6463e5bd // bfmmla z29.s, z13.h, z3.h\n"
- ".inst 0x6463e5de // bfmmla z30.s, z14.h, z3.h\n"
- ".inst 0x6463e5ff // bfmmla z31.s, z15.h, z3.h\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6468e590 // bfmmla z16.s, z12.h, z8.h\n"
- ".inst 0x6468e5b1 // bfmmla z17.s, z13.h, z8.h\n"
- ".inst 0x6468e5d2 // bfmmla z18.s, z14.h, z8.h\n"
- ".inst 0x6468e5f3 // bfmmla z19.s, z15.h, z8.h\n"
- ".inst 0x6469e594 // bfmmla z20.s, z12.h, z9.h\n"
- ".inst 0x6469e5b5 // bfmmla z21.s, z13.h, z9.h\n"
- ".inst 0x6469e5d6 // bfmmla z22.s, z14.h, z9.h\n"
- ".inst 0x6469e5f7 // bfmmla z23.s, z15.h, z9.h\n"
- ".inst 0x646ae598 // bfmmla z24.s, z12.h, z10.h\n"
- ".inst 0x646ae5b9 // bfmmla z25.s, z13.h, z10.h\n"
- ".inst 0x646ae5da // bfmmla z26.s, z14.h, z10.h\n"
- ".inst 0x646ae5fb // bfmmla z27.s, z15.h, z10.h\n"
- ".inst 0x646be59c // bfmmla z28.s, z12.h, z11.h\n"
- ".inst 0x646be5bd // bfmmla z29.s, z13.h, z11.h\n"
- ".inst 0x646be5de // bfmmla z30.s, z14.h, z11.h\n"
- ".inst 0x646be5ff // bfmmla z31.s, z15.h, z11.h\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp2 z5.s, z20.s, z21.s\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "st1w z4.s, p0, [c_ptr2]\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "st1w z5.s, p0, [c_ptr3]\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "fmax z28.s, p7/m, z28.s, z14.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "fmin z28.s, p7/m, z28.s, z15.s\n"
- "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
- "uzp1 z8.s, z24.s, z25.s\n"
- "uzp2 z9.s, z24.s, z25.s\n"
- "uzp1 z10.s, z26.s, z27.s\n"
- "uzp2 z11.s, z26.s, z27.s\n"
- "st1w z8.s, p0, [c_ptr4]\n"
- "fmax z29.s, p7/m, z29.s, z14.s\n"
- "fmax z30.s, p7/m, z30.s, z14.s\n"
- "fmax z31.s, p7/m, z31.s, z14.s\n"
- "st1w z9.s, p0, [c_ptr5]\n"
- "fmin z29.s, p7/m, z29.s, z15.s\n"
- "fmin z30.s, p7/m, z30.s, z15.s\n"
- "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n"
- "fmin z31.s, p7/m, z31.s, z15.s\n"
- "uzp1 z12.s, z28.s, z29.s\n"
- "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n"
- "uzp2 z13.s, z28.s, z29.s\n"
- "uzp1 z14.s, z30.s, z31.s\n"
- "uzp2 z15.s, z30.s, z31.s\n"
- "st1w z12.s, p0, [c_ptr6]\n"
- "st1w z13.s, p0, [c_ptr7]\n"
- "st1w z14.s, p1, [c_ptr6, #1, MUL VL]\n"
- "st1w z15.s, p1, [c_ptr7, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc", "memory"
- );
- break;
- }
-
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp
deleted file mode 100644
index bd457e9d27..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2.hpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-#include "../bfloat.hpp"
-#include "../std_transforms_sve.hpp"
-
-namespace arm_gemm
-{
-
-// Actual kernel implementations
-void sve_hybrid_bf16fp32_mmla_6VLx2(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool);
-
-class hybrid_bf16fp32_mmla_6VLx2
-{
-public:
- typedef bfloat16 operand_type;
- typedef float result_type;
-
- typedef void (*kern_type)(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool);
-
- /* Kernel blocking parameters */
- static constexpr unsigned int out_height()
- {
- return 4;
- }
-
- static unsigned int out_width()
- {
- return get_vector_length<float>() * 3;
- }
-
- static constexpr unsigned int k_unroll()
- {
- return 4;
- }
-
- static constexpr bool supports_accumulate()
- {
- return true;
- }
-
- static constexpr bool supports_bias()
- {
- return true;
- }
-
- static constexpr bool supports_activation()
- {
- return true;
- }
-
- StdTransformsSVE<operand_type, result_type, 2, 6, 4> transforms = {};
-
- // Default to the generic kernel
- kern_type kernel=sve_hybrid_bf16fp32_mmla_6VLx2;
-
- hybrid_bf16fp32_mmla_6VLx2(const CPUInfo *)
- {
-
- }
-};
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp
deleted file mode 100644
index 59dc6dc540..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_6VLx2/generic.cpp
+++ /dev/null
@@ -1,1633 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_bf16fp32_mmla_6VLx2(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
- const int K_stride = ((K + 3) / 4) * 4;
- const long loops_count = ((K + 8) / 16) - 1;
- K -= loops_count * 16;
- const long regs_count = (K / 8) - 1;
- K -= (regs_count + 1) * 8;
- const long leftovers = K;
- const long blocks_count = (K + 3) / 4;
- float nullbias[192];
- if (!accumulate && !bias) {
- memset(nullbias, 0, (3 * get_vector_length<float>() * sizeof(float)));
- }
- float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
- float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
- const float * const minptr = &minval;
- const float * const maxptr = &maxval;
-
- switch(act.type)
- {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- maxval = static_cast<float>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- minval = 0.0f;
- break;
- }
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const bfloat16 * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(bfloat16);
-
- float *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=(3 * get_vector_length<float>())) {
- const long width = std::min((unsigned long)N-x0, (3 * get_vector_length<float>()));
- long loops = loops_count;
- long regs = regs_count;
- long temp = 0;
- long blocks = blocks_count;
- const bfloat16 *a_ptr0 = a_ptr0_base;
- const bfloat16 *b_ptr0 = B + (K_stride * x0);
- const unsigned long ldcb = ldc * sizeof(float);
- const float *biasptr = bias ? bias+x0 : nullbias;
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z1.h, #0\n"
- "ld1w z19.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "zip1 z20.s, z19.s, z19.s\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip2 z21.s, z19.s, z19.s\n"
- "ld1w z19.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "zip1 z22.s, z19.s, z19.s\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "zip2 z23.s, z19.s, z19.s\n"
- "ld1w z19.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "zip1 z24.s, z19.s, z19.s\n"
- "zip2 z25.s, z19.s, z19.s\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "mov z18.s, #0\n"
- "ld1w z17.s, p0/z, [%[c_ptr0]]\n"
- "mov z1.h, #0\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "zip1 z20.s, z17.s, z18.s\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip2 z21.s, z17.s, z18.s\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z18.s, #0\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "zip1 z22.s, z17.s, z18.s\n"
- "zip2 z23.s, z17.s, z18.s\n"
- "ld1w z17.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "mov z18.s, #0\n"
- "zip1 z24.s, z17.s, z18.s\n"
- "zip2 z25.s, z17.s, z18.s\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z3.h, #0\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "trn1 z0.d, z2.d, z3.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "mov z1.h, #0\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z3.h, #0\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "trn2 z4.d, z2.d, z3.d\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn1 z0.d, z2.d, z3.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z1.h, #0\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "b.eq 5f\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #12\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z3.h, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "b.eq 5f\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "5:\n"
- "ld1rw z18.s, p7/z, [%[minptr]]\n"
- "ld1rw z19.s, p7/z, [%[maxptr]]\n"
- "fmax z20.s, p7/m, z20.s, z18.s\n"
- "fmax z21.s, p7/m, z21.s, z18.s\n"
- "fmax z22.s, p7/m, z22.s, z18.s\n"
- "fmax z23.s, p7/m, z23.s, z18.s\n"
- "fmin z20.s, p7/m, z20.s, z19.s\n"
- "fmin z21.s, p7/m, z21.s, z19.s\n"
- "fmin z22.s, p7/m, z22.s, z19.s\n"
- "fmin z23.s, p7/m, z23.s, z19.s\n"
- "fmax z24.s, p7/m, z24.s, z18.s\n"
- "uzp1 z0.s, z20.s, z21.s\n"
- "fmax z25.s, p7/m, z25.s, z18.s\n"
- "uzp1 z1.s, z22.s, z23.s\n"
- "fmin z24.s, p7/m, z24.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z25.s, p7/m, z25.s, z19.s\n"
- "st1w z1.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "uzp1 z2.s, z24.s, z25.s\n"
- "st1w z2.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z19.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z20.s, z19.s, z19.s\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "zip2 z21.s, z19.s, z19.s\n"
- "ld1w z19.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip1 z22.s, z19.s, z19.s\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "zip2 z23.s, z19.s, z19.s\n"
- "ld1w z19.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "zip1 z24.s, z19.s, z19.s\n"
- "zip2 z25.s, z19.s, z19.s\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z17.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z18.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z20.s, z17.s, z18.s\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "zip2 z21.s, z17.s, z18.s\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1w z18.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip1 z22.s, z17.s, z18.s\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "zip2 z23.s, z17.s, z18.s\n"
- "ld1w z17.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z18.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "zip1 z24.s, z17.s, z18.s\n"
- "zip2 z25.s, z17.s, z18.s\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr1]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- "subs %[loops], %[loops], #0x1\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- "trn1 z0.d, z2.d, z3.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr1]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z0.d, z2.d, z3.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "b.eq 5f\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #12\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1rqh z3.h, p6/z, [a_ptr1]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "b.eq 5f\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "5:\n"
- "ld1rw z18.s, p7/z, [%[minptr]]\n"
- "ld1rw z19.s, p7/z, [%[maxptr]]\n"
- "fmax z20.s, p7/m, z20.s, z18.s\n"
- "fmax z21.s, p7/m, z21.s, z18.s\n"
- "fmax z22.s, p7/m, z22.s, z18.s\n"
- "fmax z23.s, p7/m, z23.s, z18.s\n"
- "fmin z20.s, p7/m, z20.s, z19.s\n"
- "fmin z21.s, p7/m, z21.s, z19.s\n"
- "fmin z22.s, p7/m, z22.s, z19.s\n"
- "fmin z23.s, p7/m, z23.s, z19.s\n"
- "fmax z24.s, p7/m, z24.s, z18.s\n"
- "uzp1 z0.s, z20.s, z21.s\n"
- "uzp2 z1.s, z20.s, z21.s\n"
- "uzp1 z2.s, z22.s, z23.s\n"
- "uzp2 z3.s, z22.s, z23.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z24.s, p7/m, z24.s, z19.s\n"
- "fmax z25.s, p7/m, z25.s, z18.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmin z25.s, p7/m, z25.s, z19.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "uzp1 z4.s, z24.s, z25.s\n"
- "uzp2 z5.s, z24.s, z25.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #3\n"
- "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z3.h, #0\n"
- "ld1w z19.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z20.s, z19.s, z19.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z21.s, z19.s, z19.s\n"
- "ld1w z19.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z22.s, z19.s, z19.s\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "zip2 z23.s, z19.s, z19.s\n"
- "ld1w z19.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z26.d, z20.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z27.d, z21.d\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "zip1 z24.s, z19.s, z19.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "zip2 z25.s, z19.s, z19.s\n"
- "mov z28.d, z22.d\n"
- "mov z29.d, z23.d\n"
- "mov z30.d, z24.d\n"
- "mov z31.d, z25.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "mov z3.h, #0\n"
- "ld1w z17.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z18.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z20.s, z17.s, z18.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z21.s, z17.s, z18.s\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1w z18.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip1 z22.s, z17.s, z18.s\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "zip2 z23.s, z17.s, z18.s\n"
- "ld1w z17.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z18.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "zip1 z24.s, z17.s, z18.s\n"
- "zip2 z25.s, z17.s, z18.s\n"
- "ld1w z17.s, p0/z, [c_ptr2]\n"
- "mov z18.s, #0\n"
- "zip1 z26.s, z17.s, z18.s\n"
- "zip2 z27.s, z17.s, z18.s\n"
- "ld1w z17.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "mov z18.s, #0\n"
- "zip1 z28.s, z17.s, z18.s\n"
- "zip2 z29.s, z17.s, z18.s\n"
- "ld1w z17.s, p2/z, [c_ptr2, #2, MUL VL]\n"
- "mov z18.s, #0\n"
- "zip1 z30.s, z17.s, z18.s\n"
- "zip2 z31.s, z17.s, z18.s\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "subs %[loops], %[loops], #0x1\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr1]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1rqh z4.h, p7/z, [a_ptr2]\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z5.h, #0\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "trn1 z1.d, z4.d, z5.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "trn2 z5.d, z4.d, z5.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z3.h, #0\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr1]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1rqh z4.h, p7/z, [a_ptr2]\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z5.h, #0\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z1.d, z4.d, z5.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn2 z5.d, z4.d, z5.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "mov z3.h, #0\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "b.eq 5f\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #12\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1rqh z3.h, p6/z, [a_ptr1]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1rqh z4.h, p6/z, [a_ptr2]\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z5.h, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "trn1 z1.d, z4.d, z5.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z5.d, z4.d, z5.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "b.eq 5f\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "5:\n"
- "ld1rw z18.s, p7/z, [%[minptr]]\n"
- "ld1rw z19.s, p7/z, [%[maxptr]]\n"
- "fmax z20.s, p7/m, z20.s, z18.s\n"
- "fmax z21.s, p7/m, z21.s, z18.s\n"
- "fmax z22.s, p7/m, z22.s, z18.s\n"
- "fmax z23.s, p7/m, z23.s, z18.s\n"
- "fmin z20.s, p7/m, z20.s, z19.s\n"
- "fmin z21.s, p7/m, z21.s, z19.s\n"
- "fmin z22.s, p7/m, z22.s, z19.s\n"
- "fmin z23.s, p7/m, z23.s, z19.s\n"
- "fmax z24.s, p7/m, z24.s, z18.s\n"
- "uzp1 z0.s, z20.s, z21.s\n"
- "uzp2 z1.s, z20.s, z21.s\n"
- "uzp1 z2.s, z22.s, z23.s\n"
- "uzp2 z3.s, z22.s, z23.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z24.s, p7/m, z24.s, z19.s\n"
- "fmax z25.s, p7/m, z25.s, z18.s\n"
- "fmax z26.s, p7/m, z26.s, z18.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z27.s, p7/m, z27.s, z18.s\n"
- "fmax z28.s, p7/m, z28.s, z18.s\n"
- "fmin z25.s, p7/m, z25.s, z19.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z19.s\n"
- "fmin z27.s, p7/m, z27.s, z19.s\n"
- "fmin z28.s, p7/m, z28.s, z19.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z24.s, z25.s\n"
- "uzp2 z5.s, z24.s, z25.s\n"
- "uzp1 z6.s, z26.s, z27.s\n"
- "fmax z29.s, p7/m, z29.s, z18.s\n"
- "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmax z30.s, p7/m, z30.s, z18.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #3\n"
- "fmax z31.s, p7/m, z31.s, z18.s\n"
- "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n"
- "fmin z29.s, p7/m, z29.s, z19.s\n"
- "fmin z30.s, p7/m, z30.s, z19.s\n"
- "fmin z31.s, p7/m, z31.s, z19.s\n"
- "st1w z6.s, p0, [c_ptr2]\n"
- "uzp1 z7.s, z28.s, z29.s\n"
- "uzp1 z8.s, z30.s, z31.s\n"
- "st1w z7.s, p1, [c_ptr2, #1, MUL VL]\n"
- "st1w z8.s, p2, [c_ptr2, #2, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z19.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z20.s, z19.s, z19.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z21.s, z19.s, z19.s\n"
- "ld1w z19.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z22.s, z19.s, z19.s\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip2 z23.s, z19.s, z19.s\n"
- "ld1w z19.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z26.d, z20.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z27.d, z21.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "zip1 z24.s, z19.s, z19.s\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "zip2 z25.s, z19.s, z19.s\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z28.d, z22.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "mov z29.d, z23.d\n"
- "mov z30.d, z24.d\n"
- "mov z31.d, z25.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z17.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z18.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z20.s, z17.s, z18.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z21.s, z17.s, z18.s\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1w z18.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z22.s, z17.s, z18.s\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip2 z23.s, z17.s, z18.s\n"
- "ld1w z17.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "ld1w z18.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "zip1 z24.s, z17.s, z18.s\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "zip2 z25.s, z17.s, z18.s\n"
- "ld1w z17.s, p0/z, [c_ptr2]\n"
- "ld1w z18.s, p0/z, [c_ptr3]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "zip1 z26.s, z17.s, z18.s\n"
- "zip2 z27.s, z17.s, z18.s\n"
- "ld1w z17.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z18.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "zip1 z28.s, z17.s, z18.s\n"
- "zip2 z29.s, z17.s, z18.s\n"
- "ld1w z17.s, p2/z, [c_ptr2, #2, MUL VL]\n"
- "ld1w z18.s, p2/z, [c_ptr3, #2, MUL VL]\n"
- "zip1 z30.s, z17.s, z18.s\n"
- "zip2 z31.s, z17.s, z18.s\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "subs %[loops], %[loops], #0x1\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr1]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1rqh z4.h, p7/z, [a_ptr2]\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr3]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "trn1 z1.d, z4.d, z5.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "trn2 z5.d, z4.d, z5.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr1]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1rqh z4.h, p7/z, [a_ptr2]\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr3]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z1.d, z4.d, z5.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn2 z5.d, z4.d, z5.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "b.eq 5f\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #12\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- "ld1rqh z3.h, p6/z, [a_ptr1]\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- "ld1rqh z4.h, p6/z, [a_ptr2]\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1rqh z5.h, p6/z, [a_ptr3]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "trn1 z1.d, z4.d, z5.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z5.d, z4.d, z5.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6460e4d4 // bfmmla z20.s, z6.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e4f5 // bfmmla z21.s, z7.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6460e516 // bfmmla z22.s, z8.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6461e4da // bfmmla z26.s, z6.h, z1.h\n"
- ".inst 0x6460e537 // bfmmla z23.s, z9.h, z0.h\n"
- ".inst 0x6460e558 // bfmmla z24.s, z10.h, z0.h\n"
- ".inst 0x6460e579 // bfmmla z25.s, z11.h, z0.h\n"
- ".inst 0x6461e4fb // bfmmla z27.s, z7.h, z1.h\n"
- ".inst 0x6461e51c // bfmmla z28.s, z8.h, z1.h\n"
- ".inst 0x6461e53d // bfmmla z29.s, z9.h, z1.h\n"
- ".inst 0x6461e55e // bfmmla z30.s, z10.h, z1.h\n"
- ".inst 0x6461e57f // bfmmla z31.s, z11.h, z1.h\n"
- "b.eq 5f\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6464e4d4 // bfmmla z20.s, z6.h, z4.h\n"
- ".inst 0x6464e4f5 // bfmmla z21.s, z7.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6465e4da // bfmmla z26.s, z6.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6465e4fb // bfmmla z27.s, z7.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6464e516 // bfmmla z22.s, z8.h, z4.h\n"
- ".inst 0x6464e537 // bfmmla z23.s, z9.h, z4.h\n"
- ".inst 0x6464e558 // bfmmla z24.s, z10.h, z4.h\n"
- ".inst 0x6464e579 // bfmmla z25.s, z11.h, z4.h\n"
- ".inst 0x6465e51c // bfmmla z28.s, z8.h, z5.h\n"
- ".inst 0x6465e53d // bfmmla z29.s, z9.h, z5.h\n"
- ".inst 0x6465e55e // bfmmla z30.s, z10.h, z5.h\n"
- ".inst 0x6465e57f // bfmmla z31.s, z11.h, z5.h\n"
- "5:\n"
- "ld1rw z18.s, p7/z, [%[minptr]]\n"
- "ld1rw z19.s, p7/z, [%[maxptr]]\n"
- "fmax z20.s, p7/m, z20.s, z18.s\n"
- "fmax z21.s, p7/m, z21.s, z18.s\n"
- "fmax z22.s, p7/m, z22.s, z18.s\n"
- "fmax z23.s, p7/m, z23.s, z18.s\n"
- "fmin z20.s, p7/m, z20.s, z19.s\n"
- "fmin z21.s, p7/m, z21.s, z19.s\n"
- "fmin z22.s, p7/m, z22.s, z19.s\n"
- "fmin z23.s, p7/m, z23.s, z19.s\n"
- "fmax z24.s, p7/m, z24.s, z18.s\n"
- "uzp1 z0.s, z20.s, z21.s\n"
- "uzp2 z1.s, z20.s, z21.s\n"
- "uzp1 z2.s, z22.s, z23.s\n"
- "uzp2 z3.s, z22.s, z23.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z24.s, p7/m, z24.s, z19.s\n"
- "fmax z25.s, p7/m, z25.s, z18.s\n"
- "fmax z26.s, p7/m, z26.s, z18.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z27.s, p7/m, z27.s, z18.s\n"
- "fmax z28.s, p7/m, z28.s, z18.s\n"
- "fmin z25.s, p7/m, z25.s, z19.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z19.s\n"
- "fmin z27.s, p7/m, z27.s, z19.s\n"
- "fmin z28.s, p7/m, z28.s, z19.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z24.s, z25.s\n"
- "uzp2 z5.s, z24.s, z25.s\n"
- "uzp1 z6.s, z26.s, z27.s\n"
- "uzp2 z7.s, z26.s, z27.s\n"
- "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmax z29.s, p7/m, z29.s, z18.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #3\n"
- "fmax z30.s, p7/m, z30.s, z18.s\n"
- "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n"
- "fmax z31.s, p7/m, z31.s, z18.s\n"
- "fmin z29.s, p7/m, z29.s, z19.s\n"
- "fmin z30.s, p7/m, z30.s, z19.s\n"
- "st1w z6.s, p0, [c_ptr2]\n"
- "fmin z31.s, p7/m, z31.s, z19.s\n"
- "uzp1 z8.s, z28.s, z29.s\n"
- "uzp2 z9.s, z28.s, z29.s\n"
- "st1w z7.s, p0, [c_ptr3]\n"
- "uzp1 z10.s, z30.s, z31.s\n"
- "uzp2 z11.s, z30.s, z31.s\n"
- "st1w z8.s, p1, [c_ptr2, #1, MUL VL]\n"
- "st1w z9.s, p1, [c_ptr3, #1, MUL VL]\n"
- "st1w z10.s, p2, [c_ptr2, #2, MUL VL]\n"
- "st1w z11.s, p2, [c_ptr3, #2, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- }
-
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp
deleted file mode 100644
index f25f7473cb..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2.hpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-#include "../bfloat.hpp"
-#include "../std_transforms_sve.hpp"
-
-namespace arm_gemm
-{
-
-// Actual kernel implementation, defined in sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp. Arguments in order: A, lda, B, C, ldc, M, N, K, bias, act, accumulate.
-void sve_hybrid_bf16fp32_mmla_8VLx2(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool);
-
-class hybrid_bf16fp32_mmla_8VLx2 // Descriptor for the SVE hybrid bf16 -> fp32 MMLA kernel: bundles its element types, blocking parameters, capability flags and entry point.
-{
-public:
- typedef bfloat16 operand_type; // matches the bfloat16 A and B pointers in the kernel signature
- typedef float result_type; // matches the float C pointer in the kernel signature
-
- typedef void (*kern_type)(const bfloat16 *, int, const bfloat16 *, float *, int, int, int, int, const float *, Activation, bool); // same signature as sve_hybrid_bf16fp32_mmla_8VLx2
-
- /* Kernel blocking parameters (values mirror the constants hard-coded in the generic kernel) */
- static constexpr unsigned int out_height() // the generic kernel handles at most 4 rows per asm invocation
- {
- return 4;
- }
-
- static unsigned int out_width() // not constexpr: depends on get_vector_length<float>(); the generic kernel steps N by this same amount
- {
- return get_vector_length<float>() * 4;
- }
-
- static constexpr unsigned int k_unroll() // the generic kernel rounds K up to a multiple of 4 when computing the B stride
- {
- return 4;
- }
-
- static constexpr bool supports_accumulate() // the kernel takes an 'accumulate' flag and reloads C when it is set
- {
- return true;
- }
-
- static constexpr bool supports_bias() // the kernel takes a 'bias' pointer (a zeroed buffer is substituted when it is null and not accumulating)
- {
- return true;
- }
-
- static constexpr bool supports_activation() // the kernel takes an Activation and clamps results to the derived min/max
- {
- return true;
- }
-
- StdTransformsSVE<operand_type, result_type, 2, 8, 4> transforms = {}; // NOTE(review): 2, 8, 4 presumably encode the "8VLx2" layout and k_unroll -- confirm against std_transforms_sve.hpp
-
- // Default to the generic kernel (the only implementation declared in this header)
- kern_type kernel=sve_hybrid_bf16fp32_mmla_8VLx2;
-
- hybrid_bf16fp32_mmla_8VLx2(const CPUInfo *) // the CPUInfo argument is unused: the body is empty, so 'kernel' keeps its default
- {
-
- }
-};
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp
deleted file mode 100644
index f38a2ea2e3..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_bf16fp32_mmla_8VLx2/generic.cpp
+++ /dev/null
@@ -1,2001 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include "../../bfloat.hpp"
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_bf16fp32_mmla_8VLx2(const bfloat16 *A, int lda, const bfloat16 *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
- const int K_stride = ((K + 3) / 4) * 4;
- const long loops_count = ((K + 8) / 16) - 1;
- K -= loops_count * 16;
- const long regs_count = (K / 8) - 1;
- K -= (regs_count + 1) * 8;
- const long leftovers = K;
- const long blocks_count = (K + 3) / 4;
- float nullbias[256];
- if (!accumulate && !bias) {
- memset(nullbias, 0, (4 * get_vector_length<float>() * sizeof(float)));
- }
- float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
- float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
- const float * const minptr = &minval;
- const float * const maxptr = &maxval;
-
- switch(act.type)
- {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- maxval = static_cast<float>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- minval = 0.0f;
- break;
- }
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const bfloat16 * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(bfloat16);
-
- float *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
- const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
- long loops = loops_count;
- long regs = regs_count;
- long temp = 0;
- long blocks = blocks_count;
- const bfloat16 *a_ptr0 = a_ptr0_base;
- const bfloat16 *b_ptr0 = B + (K_stride * x0);
- const unsigned long ldcb = ldc * sizeof(float);
- const float *biasptr = bias ? bias+x0 : nullbias;
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z1.h, #0\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1w z15.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "zip1 z20.s, z15.s, z15.s\n"
- "zip2 z21.s, z15.s, z15.s\n"
- "ld1w z15.s, p3/z, [%[biasptr], #3, MUL VL]\n"
- "zip1 z22.s, z15.s, z15.s\n"
- "zip2 z23.s, z15.s, z15.s\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "mov z14.s, #0\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "mov z1.h, #0\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z14.s, #0\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "mov z14.s, #0\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "mov z14.s, #0\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z3.h, #0\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "trn1 z0.d, z2.d, z3.d\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "mov z1.h, #0\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z3.h, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "trn1 z0.d, z2.d, z3.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z1.h, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z3.h, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "uzp1 z1.s, z18.s, z19.s\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "st1w z1.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "uzp1 z2.s, z20.s, z21.s\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z2.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "uzp1 z3.s, z22.s, z23.s\n"
- "st1w z3.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1w z15.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "zip1 z20.s, z15.s, z15.s\n"
- "zip2 z21.s, z15.s, z15.s\n"
- "ld1w z15.s, p3/z, [%[biasptr], #3, MUL VL]\n"
- "zip1 z22.s, z15.s, z15.s\n"
- "zip2 z23.s, z15.s, z15.s\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z14.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z14.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr1]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "trn1 z0.d, z2.d, z3.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr1]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "trn1 z0.d, z2.d, z3.d\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1rqh z3.h, p6/z, [a_ptr1]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp2 z5.s, z20.s, z21.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n"
- "st1w z6.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "st1w z7.s, p3, [c_ptr1, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z3.h, #0\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1w z15.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z24.d, z16.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z25.d, z17.d\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "zip1 z20.s, z15.s, z15.s\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "zip2 z21.s, z15.s, z15.s\n"
- "ld1w z15.s, p3/z, [%[biasptr], #3, MUL VL]\n"
- "mov z26.d, z18.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z27.d, z19.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "zip1 z22.s, z15.s, z15.s\n"
- "zip2 z23.s, z15.s, z15.s\n"
- "mov z28.d, z20.d\n"
- "mov z29.d, z21.d\n"
- "mov z30.d, z22.d\n"
- "mov z31.d, z23.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "mov z3.h, #0\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z14.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z14.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "mov z14.s, #0\n"
- "zip1 z24.s, z13.s, z14.s\n"
- "zip2 z25.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "mov z14.s, #0\n"
- "zip1 z26.s, z13.s, z14.s\n"
- "zip2 z27.s, z13.s, z14.s\n"
- "ld1w z13.s, p2/z, [c_ptr2, #2, MUL VL]\n"
- "mov z14.s, #0\n"
- "zip1 z28.s, z13.s, z14.s\n"
- "zip2 z29.s, z13.s, z14.s\n"
- "ld1w z13.s, p3/z, [c_ptr2, #3, MUL VL]\n"
- "mov z14.s, #0\n"
- "zip1 z30.s, z13.s, z14.s\n"
- "zip2 z31.s, z13.s, z14.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "subs %[loops], %[loops], #0x1\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr1]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1rqh z4.h, p7/z, [a_ptr2]\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z5.h, #0\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z1.d, z4.d, z5.d\n"
- "trn2 z5.d, z4.d, z5.d\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z3.h, #0\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr1]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1rqh z4.h, p7/z, [a_ptr2]\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z5.h, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z1.d, z4.d, z5.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z5.d, z4.d, z5.d\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "mov z3.h, #0\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1rqh z3.h, p6/z, [a_ptr1]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1rqh z4.h, p6/z, [a_ptr2]\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z5.h, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "trn1 z1.d, z4.d, z5.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z5.d, z4.d, z5.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp2 z5.s, z20.s, z21.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n"
- "fmax z28.s, p7/m, z28.s, z14.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "st1w z6.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "uzp1 z8.s, z24.s, z25.s\n"
- "st1w z7.s, p3, [c_ptr1, #3, MUL VL]\n"
- "fmin z28.s, p7/m, z28.s, z15.s\n"
- "uzp1 z9.s, z26.s, z27.s\n"
- "fmax z29.s, p7/m, z29.s, z14.s\n"
- "st1w z8.s, p0, [c_ptr2]\n"
- "fmax z30.s, p7/m, z30.s, z14.s\n"
- "fmax z31.s, p7/m, z31.s, z14.s\n"
- "fmin z29.s, p7/m, z29.s, z15.s\n"
- "st1w z9.s, p1, [c_ptr2, #1, MUL VL]\n"
- "fmin z30.s, p7/m, z30.s, z15.s\n"
- "fmin z31.s, p7/m, z31.s, z15.s\n"
- "uzp1 z10.s, z28.s, z29.s\n"
- "uzp1 z11.s, z30.s, z31.s\n"
- "st1w z10.s, p2, [c_ptr2, #2, MUL VL]\n"
- "st1w z11.s, p3, [c_ptr2, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1w z15.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z24.d, z16.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z25.d, z17.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "zip1 z20.s, z15.s, z15.s\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "zip2 z21.s, z15.s, z15.s\n"
- "ld1w z15.s, p3/z, [%[biasptr], #3, MUL VL]\n"
- "mov z26.d, z18.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "mov z27.d, z19.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "mov z28.d, z20.d\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z22.s, z15.s, z15.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "zip2 z23.s, z15.s, z15.s\n"
- "mov z29.d, z21.d\n"
- "mov z30.d, z22.d\n"
- "mov z31.d, z23.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z4.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "ld1w z14.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z14.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "ld1w z14.s, p0/z, [c_ptr3]\n"
- "zip1 z24.s, z13.s, z14.s\n"
- "zip2 z25.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "zip1 z26.s, z13.s, z14.s\n"
- "zip2 z27.s, z13.s, z14.s\n"
- "ld1w z13.s, p2/z, [c_ptr2, #2, MUL VL]\n"
- "ld1w z14.s, p2/z, [c_ptr3, #2, MUL VL]\n"
- "zip1 z28.s, z13.s, z14.s\n"
- "zip2 z29.s, z13.s, z14.s\n"
- "ld1w z13.s, p3/z, [c_ptr2, #3, MUL VL]\n"
- "ld1w z14.s, p3/z, [c_ptr3, #3, MUL VL]\n"
- "zip1 z30.s, z13.s, z14.s\n"
- "zip2 z31.s, z13.s, z14.s\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "subs %[loops], %[loops], #0x1\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr1]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1rqh z4.h, p7/z, [a_ptr2]\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr3]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z1.d, z4.d, z5.d\n"
- "trn2 z5.d, z4.d, z5.d\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p7/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1rqh z3.h, p7/z, [a_ptr1]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1rqh z4.h, p7/z, [a_ptr2]\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1rqh z5.h, p7/z, [a_ptr3]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z1.d, z4.d, z5.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z5.d, z4.d, z5.d\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "trn1 z4.d, z0.d, z1.d\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "trn1 z5.d, z2.d, z3.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqh z2.h, p6/z, [%[a_ptr0]]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1rqh z3.h, p6/z, [a_ptr1]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- "ld1rqh z4.h, p6/z, [a_ptr2]\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1rqh z5.h, p6/z, [a_ptr3]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- "trn1 z0.d, z2.d, z3.d\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "trn1 z1.d, z4.d, z5.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z5.d, z4.d, z5.d\n"
- "ld1h z6.h, p7/z, [%[b_ptr0]]\n"
- "trn2 z4.d, z2.d, z3.d\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x6460e4d0 // bfmmla z16.s, z6.h, z0.h\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x6460e4f1 // bfmmla z17.s, z7.h, z0.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x6460e512 // bfmmla z18.s, z8.h, z0.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x6461e4d8 // bfmmla z24.s, z6.h, z1.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x6460e533 // bfmmla z19.s, z9.h, z0.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x6460e554 // bfmmla z20.s, z10.h, z0.h\n"
- ".inst 0x6460e575 // bfmmla z21.s, z11.h, z0.h\n"
- ".inst 0x6460e596 // bfmmla z22.s, z12.h, z0.h\n"
- ".inst 0x6460e5b7 // bfmmla z23.s, z13.h, z0.h\n"
- ".inst 0x6461e4f9 // bfmmla z25.s, z7.h, z1.h\n"
- ".inst 0x6461e51a // bfmmla z26.s, z8.h, z1.h\n"
- ".inst 0x6461e53b // bfmmla z27.s, z9.h, z1.h\n"
- ".inst 0x6461e55c // bfmmla z28.s, z10.h, z1.h\n"
- ".inst 0x6461e57d // bfmmla z29.s, z11.h, z1.h\n"
- ".inst 0x6461e59e // bfmmla z30.s, z12.h, z1.h\n"
- ".inst 0x6461e5bf // bfmmla z31.s, z13.h, z1.h\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z6.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z7.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x6464e4d0 // bfmmla z16.s, z6.h, z4.h\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x6464e4f1 // bfmmla z17.s, z7.h, z4.h\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x6464e512 // bfmmla z18.s, z8.h, z4.h\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x6464e533 // bfmmla z19.s, z9.h, z4.h\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- ".inst 0x6464e554 // bfmmla z20.s, z10.h, z4.h\n"
- ".inst 0x6464e575 // bfmmla z21.s, z11.h, z4.h\n"
- ".inst 0x6464e596 // bfmmla z22.s, z12.h, z4.h\n"
- ".inst 0x6464e5b7 // bfmmla z23.s, z13.h, z4.h\n"
- ".inst 0x6465e4d8 // bfmmla z24.s, z6.h, z5.h\n"
- ".inst 0x6465e4f9 // bfmmla z25.s, z7.h, z5.h\n"
- ".inst 0x6465e51a // bfmmla z26.s, z8.h, z5.h\n"
- ".inst 0x6465e53b // bfmmla z27.s, z9.h, z5.h\n"
- ".inst 0x6465e55c // bfmmla z28.s, z10.h, z5.h\n"
- ".inst 0x6465e57d // bfmmla z29.s, z11.h, z5.h\n"
- ".inst 0x6465e59e // bfmmla z30.s, z12.h, z5.h\n"
- ".inst 0x6465e5bf // bfmmla z31.s, z13.h, z5.h\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp2 z5.s, z20.s, z21.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "st1w z4.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "st1w z5.s, p2, [c_ptr1, #2, MUL VL]\n"
- "fmax z28.s, p7/m, z28.s, z14.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "st1w z6.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "uzp1 z8.s, z24.s, z25.s\n"
- "st1w z7.s, p3, [c_ptr1, #3, MUL VL]\n"
- "uzp2 z9.s, z24.s, z25.s\n"
- "uzp1 z10.s, z26.s, z27.s\n"
- "uzp2 z11.s, z26.s, z27.s\n"
- "st1w z8.s, p0, [c_ptr2]\n"
- "fmin z28.s, p7/m, z28.s, z15.s\n"
- "fmax z29.s, p7/m, z29.s, z14.s\n"
- "fmax z30.s, p7/m, z30.s, z14.s\n"
- "st1w z9.s, p0, [c_ptr3]\n"
- "fmax z31.s, p7/m, z31.s, z14.s\n"
- "fmin z29.s, p7/m, z29.s, z15.s\n"
- "st1w z10.s, p1, [c_ptr2, #1, MUL VL]\n"
- "fmin z30.s, p7/m, z30.s, z15.s\n"
- "fmin z31.s, p7/m, z31.s, z15.s\n"
- "uzp1 z12.s, z28.s, z29.s\n"
- "st1w z11.s, p1, [c_ptr3, #1, MUL VL]\n"
- "uzp2 z13.s, z28.s, z29.s\n"
- "uzp1 z14.s, z30.s, z31.s\n"
- "uzp2 z15.s, z30.s, z31.s\n"
- "st1w z12.s, p2, [c_ptr2, #2, MUL VL]\n"
- "st1w z13.s, p2, [c_ptr3, #2, MUL VL]\n"
- "st1w z14.s, p3, [c_ptr2, #3, MUL VL]\n"
- "st1w z15.s, p3, [c_ptr3, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- }
-
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp
deleted file mode 100644
index 7610a20ac0..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4/generic.cpp
+++ /dev/null
@@ -1,3778 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_fp16_mla_4VLx4(const __fp16 *A, int lda, const __fp16 *B, __fp16 *C, int ldc, int M, int N, int K, const __fp16 *bias, Activation act, bool accumulate) {
- const int K_stride = K;
- const long loops_count = ((K + 8) / 16) - 1;
- K -= loops_count * 16;
- const long regs_count = (K / 8) - 1;
- K -= (regs_count + 1) * 8;
- const long leftovers = K;
- __fp16 nullbias[512];
- if (!accumulate && !bias) {
- memset(nullbias, 0, (4 * get_vector_length<__fp16>() * sizeof(__fp16)));
- }
- __fp16 minval = - static_cast<__fp16>(std::numeric_limits<float>::infinity());
- __fp16 maxval = static_cast<__fp16>(std::numeric_limits<float>::infinity());
- const __fp16 * const minptr = &minval;
- const __fp16 * const maxptr = &maxval;
-
- switch(act.type)
- {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- maxval = static_cast<__fp16>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- minval = 0.0f;
- break;
- }
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const __fp16 * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(__fp16);
-
- __fp16 *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=(4 * get_vector_length<__fp16>())) {
- const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<__fp16>()));
- long loops = loops_count;
- long regs = regs_count;
- long temp = 0;
- long blocks = leftovers;
- const __fp16 *a_ptr0 = a_ptr0_base;
- const __fp16 *b_ptr0 = B + (K_stride * x0);
- const unsigned long ldcb = ldc * sizeof(__fp16);
- const __fp16 *biasptr = bias ? bias+x0 : nullbias;
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.h, %[temp], %[width]\n"
- "inch %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.h, %[temp], %[width]\n"
- "inch %[temp], all, mul #1\n"
- "whilelt p2.h, %[temp], %[width]\n"
- "inch %[temp], all, mul #1\n"
- "whilelt p3.h, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1h z16.h, p0/z, [%[biasptr]]\n"
- "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n"
- "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1h z19.h, p3/z, [%[biasptr], #3, MUL VL]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
- "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[7]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z17.h, z13.h, z0.h[7]\n"
- "fmla z18.h, z14.h, z0.h[7]\n"
- "fmla z19.h, z15.h, z0.h[7]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[0]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[0]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[0]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[1]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- "fmla z17.h, z13.h, z4.h[1]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z18.h, z14.h, z4.h[1]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[1]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[2]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[2]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[2]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[2]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[3]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z17.h, z13.h, z4.h[3]\n"
- "fmla z18.h, z14.h, z4.h[3]\n"
- "fmla z19.h, z15.h, z4.h[3]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[4]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[4]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[4]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[4]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[5]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z13.h, z4.h[5]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[5]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[5]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[6]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[6]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[6]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[6]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[7]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z17.h, z13.h, z4.h[7]\n"
- "fmla z18.h, z14.h, z4.h[7]\n"
- "fmla z19.h, z15.h, z4.h[7]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[7]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z17.h, z13.h, z0.h[7]\n"
- "fmla z18.h, z14.h, z0.h[7]\n"
- "fmla z19.h, z15.h, z0.h[7]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[0]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[0]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[0]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[1]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z13.h, z4.h[1]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[1]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[1]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[2]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[2]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[2]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[2]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[3]\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z17.h, z13.h, z4.h[3]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z18.h, z14.h, z4.h[3]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "fmla z19.h, z15.h, z4.h[3]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[4]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[4]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[4]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[4]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[5]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z13.h, z4.h[5]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[5]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[5]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[6]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[6]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[6]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[6]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[7]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z17.h, z13.h, z4.h[7]\n"
- "fmla z18.h, z14.h, z4.h[7]\n"
- "fmla z19.h, z15.h, z4.h[7]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "b.eq 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "b 5f\n"
- "4:\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[7]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z17.h, z13.h, z0.h[7]\n"
- "fmla z18.h, z14.h, z0.h[7]\n"
- "fmla z19.h, z15.h, z0.h[7]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[0]\n"
- "fmla z18.h, z10.h, z4.h[0]\n"
- "fmla z19.h, z11.h, z4.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[1]\n"
- "fmla z18.h, z14.h, z4.h[1]\n"
- "fmla z19.h, z15.h, z4.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[2]\n"
- "fmla z17.h, z9.h, z4.h[2]\n"
- "fmla z18.h, z10.h, z4.h[2]\n"
- "fmla z19.h, z11.h, z4.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[3]\n"
- "fmla z18.h, z14.h, z4.h[3]\n"
- "fmla z19.h, z15.h, z4.h[3]\n"
- "b.eq 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[4]\n"
- "fmla z18.h, z10.h, z4.h[4]\n"
- "fmla z19.h, z11.h, z4.h[4]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[5]\n"
- "fmla z18.h, z14.h, z4.h[5]\n"
- "fmla z19.h, z15.h, z4.h[5]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[6]\n"
- "fmla z17.h, z9.h, z4.h[6]\n"
- "fmla z18.h, z10.h, z4.h[6]\n"
- "fmla z19.h, z11.h, z4.h[6]\n"
- "5:\n"
- "ld1rh z14.h, p7/z, [%[minptr]]\n"
- "ld1rh z15.h, p7/z, [%[maxptr]]\n"
- "fmax z16.h, p7/m, z16.h, z14.h\n"
- "fmax z17.h, p7/m, z17.h, z14.h\n"
- "fmax z18.h, p7/m, z18.h, z14.h\n"
- "fmax z19.h, p7/m, z19.h, z14.h\n"
- "fmin z16.h, p7/m, z16.h, z15.h\n"
- "fmin z17.h, p7/m, z17.h, z15.h\n"
- "fmin z18.h, p7/m, z18.h, z15.h\n"
- "fmin z19.h, p7/m, z19.h, z15.h\n"
- "st1h z16.h, p0, [%[c_ptr0]]\n"
- "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
- "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
- "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.h, %[temp], %[width]\n"
- "inch %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.h, %[temp], %[width]\n"
- "inch %[temp], all, mul #1\n"
- "whilelt p2.h, %[temp], %[width]\n"
- "inch %[temp], all, mul #1\n"
- "whilelt p3.h, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1h z16.h, p0/z, [%[biasptr]]\n"
- "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n"
- "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1h z19.h, p3/z, [%[biasptr], #3, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "mov z21.d, z17.d\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "mov z22.d, z18.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "mov z23.d, z19.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
- "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1h z20.h, p0/z, [c_ptr1]\n"
- "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- "fmla z21.h, z9.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z22.h, z10.h, z1.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla z23.h, z11.h, z1.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla z20.h, z12.h, z1.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla z21.h, z13.h, z1.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z22.h, z14.h, z1.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "fmla z23.h, z15.h, z1.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[2]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "fmla z21.h, z9.h, z1.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z22.h, z10.h, z1.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z23.h, z11.h, z1.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "fmla z20.h, z12.h, z1.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z21.h, z13.h, z1.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z22.h, z14.h, z1.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "fmla z23.h, z15.h, z1.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "fmla z20.h, z8.h, z1.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z21.h, z9.h, z1.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z22.h, z10.h, z1.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "fmla z23.h, z11.h, z1.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "fmla z20.h, z12.h, z1.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z21.h, z13.h, z1.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z22.h, z14.h, z1.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "fmla z23.h, z15.h, z1.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[6]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "fmla z21.h, z9.h, z1.h[6]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z22.h, z10.h, z1.h[6]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "fmla z23.h, z11.h, z1.h[6]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[7]\n"
- "fmla z20.h, z12.h, z1.h[7]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[7]\n"
- "fmla z21.h, z13.h, z1.h[7]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[7]\n"
- "fmla z22.h, z14.h, z1.h[7]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[7]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- "fmla z23.h, z15.h, z1.h[7]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[0]\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- "fmla z20.h, z8.h, z5.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z4.h[0]\n"
- "fmla z21.h, z9.h, z5.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[0]\n"
- "fmla z22.h, z10.h, z5.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[0]\n"
- "fmla z23.h, z11.h, z5.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[1]\n"
- "fmla z20.h, z12.h, z5.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[1]\n"
- "fmla z21.h, z13.h, z5.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[1]\n"
- "fmla z22.h, z14.h, z5.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[1]\n"
- "fmla z23.h, z15.h, z5.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z5.h[2]\n"
- "fmla z17.h, z9.h, z4.h[2]\n"
- "fmla z21.h, z9.h, z5.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z22.h, z10.h, z5.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[2]\n"
- "fmla z23.h, z11.h, z5.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[3]\n"
- "fmla z20.h, z12.h, z5.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[3]\n"
- "fmla z21.h, z13.h, z5.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[3]\n"
- "fmla z22.h, z14.h, z5.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[3]\n"
- "fmla z23.h, z15.h, z5.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[4]\n"
- "fmla z20.h, z8.h, z5.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z4.h[4]\n"
- "fmla z21.h, z9.h, z5.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[4]\n"
- "fmla z22.h, z10.h, z5.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[4]\n"
- "fmla z23.h, z11.h, z5.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[5]\n"
- "fmla z20.h, z12.h, z5.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[5]\n"
- "fmla z21.h, z13.h, z5.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[5]\n"
- "fmla z22.h, z14.h, z5.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[5]\n"
- "fmla z23.h, z15.h, z5.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z5.h[6]\n"
- "fmla z17.h, z9.h, z4.h[6]\n"
- "fmla z21.h, z9.h, z5.h[6]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[6]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z22.h, z10.h, z5.h[6]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[6]\n"
- "fmla z23.h, z11.h, z5.h[6]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[7]\n"
- "fmla z20.h, z12.h, z5.h[7]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[7]\n"
- "fmla z21.h, z13.h, z5.h[7]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[7]\n"
- "fmla z22.h, z14.h, z5.h[7]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[7]\n"
- "fmla z23.h, z15.h, z5.h[7]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- "fmla z21.h, z9.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z22.h, z10.h, z1.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "fmla z23.h, z11.h, z1.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "fmla z20.h, z12.h, z1.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "fmla z21.h, z13.h, z1.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z22.h, z14.h, z1.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "fmla z23.h, z15.h, z1.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[2]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "fmla z21.h, z9.h, z1.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z22.h, z10.h, z1.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z23.h, z11.h, z1.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "fmla z20.h, z12.h, z1.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z21.h, z13.h, z1.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z22.h, z14.h, z1.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "fmla z23.h, z15.h, z1.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "fmla z20.h, z8.h, z1.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z21.h, z9.h, z1.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z22.h, z10.h, z1.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "fmla z23.h, z11.h, z1.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "fmla z20.h, z12.h, z1.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z21.h, z13.h, z1.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z22.h, z14.h, z1.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "fmla z23.h, z15.h, z1.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[6]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "fmla z21.h, z9.h, z1.h[6]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z22.h, z10.h, z1.h[6]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "fmla z23.h, z11.h, z1.h[6]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[7]\n"
- "fmla z20.h, z12.h, z1.h[7]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[7]\n"
- "fmla z21.h, z13.h, z1.h[7]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[7]\n"
- "fmla z22.h, z14.h, z1.h[7]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[7]\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z23.h, z15.h, z1.h[7]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[0]\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- "fmla z20.h, z8.h, z5.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z4.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "fmla z21.h, z9.h, z5.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "fmla z22.h, z10.h, z5.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[0]\n"
- "fmla z23.h, z11.h, z5.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[1]\n"
- "fmla z20.h, z12.h, z5.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[1]\n"
- "fmla z21.h, z13.h, z5.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[1]\n"
- "fmla z22.h, z14.h, z5.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[1]\n"
- "fmla z23.h, z15.h, z5.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z5.h[2]\n"
- "fmla z17.h, z9.h, z4.h[2]\n"
- "fmla z21.h, z9.h, z5.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z22.h, z10.h, z5.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[2]\n"
- "fmla z23.h, z11.h, z5.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[3]\n"
- "fmla z20.h, z12.h, z5.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[3]\n"
- "fmla z21.h, z13.h, z5.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[3]\n"
- "fmla z22.h, z14.h, z5.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[3]\n"
- "fmla z23.h, z15.h, z5.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[4]\n"
- "fmla z20.h, z8.h, z5.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z4.h[4]\n"
- "fmla z21.h, z9.h, z5.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[4]\n"
- "fmla z22.h, z10.h, z5.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[4]\n"
- "fmla z23.h, z11.h, z5.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[5]\n"
- "fmla z20.h, z12.h, z5.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[5]\n"
- "fmla z21.h, z13.h, z5.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[5]\n"
- "fmla z22.h, z14.h, z5.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[5]\n"
- "fmla z23.h, z15.h, z5.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z20.h, z8.h, z5.h[6]\n"
- "fmla z17.h, z9.h, z4.h[6]\n"
- "fmla z21.h, z9.h, z5.h[6]\n"
- "fmla z18.h, z10.h, z4.h[6]\n"
- "fmla z22.h, z10.h, z5.h[6]\n"
- "fmla z19.h, z11.h, z4.h[6]\n"
- "fmla z23.h, z11.h, z5.h[6]\n"
- "fmla z16.h, z12.h, z4.h[7]\n"
- "fmla z20.h, z12.h, z5.h[7]\n"
- "fmla z17.h, z13.h, z4.h[7]\n"
- "fmla z21.h, z13.h, z5.h[7]\n"
- "fmla z18.h, z14.h, z4.h[7]\n"
- "fmla z22.h, z14.h, z5.h[7]\n"
- "fmla z19.h, z15.h, z4.h[7]\n"
- "fmla z23.h, z15.h, z5.h[7]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[0]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "fmla z21.h, z9.h, z1.h[0]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "fmla z22.h, z10.h, z1.h[0]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "fmla z23.h, z11.h, z1.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.h, z12.h, z1.h[1]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "fmla z21.h, z13.h, z1.h[1]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z22.h, z14.h, z1.h[1]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "fmla z23.h, z15.h, z1.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "fmla z20.h, z8.h, z1.h[2]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "fmla z21.h, z9.h, z1.h[2]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "fmla z22.h, z10.h, z1.h[2]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z23.h, z11.h, z1.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z12.h, z1.h[3]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z21.h, z13.h, z1.h[3]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z22.h, z14.h, z1.h[3]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "fmla z23.h, z15.h, z1.h[3]\n"
- "b.eq 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[4]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z21.h, z9.h, z1.h[4]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z22.h, z10.h, z1.h[4]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "fmla z23.h, z11.h, z1.h[4]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.h, z12.h, z1.h[5]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z21.h, z13.h, z1.h[5]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z22.h, z14.h, z1.h[5]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "fmla z23.h, z15.h, z1.h[5]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "fmla z20.h, z8.h, z1.h[6]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "fmla z21.h, z9.h, z1.h[6]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "fmla z22.h, z10.h, z1.h[6]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "fmla z23.h, z11.h, z1.h[6]\n"
- "b 5f\n"
- "4:\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- "fmla z21.h, z9.h, z1.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- "fmla z22.h, z10.h, z1.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "fmla z23.h, z11.h, z1.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "fmla z20.h, z12.h, z1.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "fmla z21.h, z13.h, z1.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z22.h, z14.h, z1.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "fmla z23.h, z15.h, z1.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[2]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "fmla z21.h, z9.h, z1.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z22.h, z10.h, z1.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z23.h, z11.h, z1.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "fmla z20.h, z12.h, z1.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z21.h, z13.h, z1.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z22.h, z14.h, z1.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "fmla z23.h, z15.h, z1.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "fmla z20.h, z8.h, z1.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z21.h, z9.h, z1.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z22.h, z10.h, z1.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "fmla z23.h, z11.h, z1.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "fmla z20.h, z12.h, z1.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z21.h, z13.h, z1.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z22.h, z14.h, z1.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "fmla z23.h, z15.h, z1.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z20.h, z8.h, z1.h[6]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "fmla z21.h, z9.h, z1.h[6]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "fmla z22.h, z10.h, z1.h[6]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "fmla z23.h, z11.h, z1.h[6]\n"
- "fmla z16.h, z12.h, z0.h[7]\n"
- "fmla z20.h, z12.h, z1.h[7]\n"
- "fmla z17.h, z13.h, z0.h[7]\n"
- "fmla z21.h, z13.h, z1.h[7]\n"
- "fmla z18.h, z14.h, z0.h[7]\n"
- "fmla z22.h, z14.h, z1.h[7]\n"
- "fmla z19.h, z15.h, z0.h[7]\n"
- "fmla z23.h, z15.h, z1.h[7]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.h, z8.h, z5.h[0]\n"
- "fmla z17.h, z9.h, z4.h[0]\n"
- "fmla z21.h, z9.h, z5.h[0]\n"
- "fmla z18.h, z10.h, z4.h[0]\n"
- "fmla z22.h, z10.h, z5.h[0]\n"
- "fmla z19.h, z11.h, z4.h[0]\n"
- "fmla z23.h, z11.h, z5.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.h, z12.h, z5.h[1]\n"
- "fmla z17.h, z13.h, z4.h[1]\n"
- "fmla z21.h, z13.h, z5.h[1]\n"
- "fmla z18.h, z14.h, z4.h[1]\n"
- "fmla z22.h, z14.h, z5.h[1]\n"
- "fmla z19.h, z15.h, z4.h[1]\n"
- "fmla z23.h, z15.h, z5.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[2]\n"
- "fmla z20.h, z8.h, z5.h[2]\n"
- "fmla z17.h, z9.h, z4.h[2]\n"
- "fmla z21.h, z9.h, z5.h[2]\n"
- "fmla z18.h, z10.h, z4.h[2]\n"
- "fmla z22.h, z10.h, z5.h[2]\n"
- "fmla z19.h, z11.h, z4.h[2]\n"
- "fmla z23.h, z11.h, z5.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z12.h, z5.h[3]\n"
- "fmla z17.h, z13.h, z4.h[3]\n"
- "fmla z21.h, z13.h, z5.h[3]\n"
- "fmla z18.h, z14.h, z4.h[3]\n"
- "fmla z22.h, z14.h, z5.h[3]\n"
- "fmla z19.h, z15.h, z4.h[3]\n"
- "fmla z23.h, z15.h, z5.h[3]\n"
- "b.eq 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.h, z8.h, z5.h[4]\n"
- "fmla z17.h, z9.h, z4.h[4]\n"
- "fmla z21.h, z9.h, z5.h[4]\n"
- "fmla z18.h, z10.h, z4.h[4]\n"
- "fmla z22.h, z10.h, z5.h[4]\n"
- "fmla z19.h, z11.h, z4.h[4]\n"
- "fmla z23.h, z11.h, z5.h[4]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.h, z12.h, z5.h[5]\n"
- "fmla z17.h, z13.h, z4.h[5]\n"
- "fmla z21.h, z13.h, z5.h[5]\n"
- "fmla z18.h, z14.h, z4.h[5]\n"
- "fmla z22.h, z14.h, z5.h[5]\n"
- "fmla z19.h, z15.h, z4.h[5]\n"
- "fmla z23.h, z15.h, z5.h[5]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[6]\n"
- "fmla z20.h, z8.h, z5.h[6]\n"
- "fmla z17.h, z9.h, z4.h[6]\n"
- "fmla z21.h, z9.h, z5.h[6]\n"
- "fmla z18.h, z10.h, z4.h[6]\n"
- "fmla z22.h, z10.h, z5.h[6]\n"
- "fmla z19.h, z11.h, z4.h[6]\n"
- "fmla z23.h, z11.h, z5.h[6]\n"
- "5:\n"
- "ld1rh z14.h, p7/z, [%[minptr]]\n"
- "ld1rh z15.h, p7/z, [%[maxptr]]\n"
- "fmax z16.h, p7/m, z16.h, z14.h\n"
- "fmax z17.h, p7/m, z17.h, z14.h\n"
- "fmax z18.h, p7/m, z18.h, z14.h\n"
- "fmax z19.h, p7/m, z19.h, z14.h\n"
- "fmin z16.h, p7/m, z16.h, z15.h\n"
- "fmin z17.h, p7/m, z17.h, z15.h\n"
- "fmin z18.h, p7/m, z18.h, z15.h\n"
- "fmin z19.h, p7/m, z19.h, z15.h\n"
- "st1h z16.h, p0, [%[c_ptr0]]\n"
- "fmax z20.h, p7/m, z20.h, z14.h\n"
- "fmax z21.h, p7/m, z21.h, z14.h\n"
- "fmax z22.h, p7/m, z22.h, z14.h\n"
- "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmax z23.h, p7/m, z23.h, z14.h\n"
- "fmin z20.h, p7/m, z20.h, z15.h\n"
- "fmin z21.h, p7/m, z21.h, z15.h\n"
- "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmin z22.h, p7/m, z22.h, z15.h\n"
- "fmin z23.h, p7/m, z23.h, z15.h\n"
- "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "st1h z20.h, p0, [c_ptr1]\n"
- "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
- "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
- "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.h, %[temp], %[width]\n"
- "inch %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.h, %[temp], %[width]\n"
- "inch %[temp], all, mul #1\n"
- "whilelt p2.h, %[temp], %[width]\n"
- "inch %[temp], all, mul #1\n"
- "whilelt p3.h, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1h z16.h, p0/z, [%[biasptr]]\n"
- "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n"
- "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1h z19.h, p3/z, [%[biasptr], #3, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "mov z21.d, z17.d\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "mov z22.d, z18.d\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "mov z23.d, z19.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "mov z24.d, z16.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z25.d, z17.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z26.d, z18.d\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z27.d, z19.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
- "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1h z20.h, p0/z, [c_ptr1]\n"
- "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1h z24.h, p0/z, [c_ptr2]\n"
- "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n"
- "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "fmla z24.h, z8.h, z2.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- "fmla z21.h, z9.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z25.h, z9.h, z2.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla z22.h, z10.h, z1.h[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla z26.h, z10.h, z2.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla z23.h, z11.h, z1.h[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla z27.h, z11.h, z2.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "fmla z20.h, z12.h, z1.h[1]\n"
- "fmla z24.h, z12.h, z2.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "fmla z21.h, z13.h, z1.h[1]\n"
- "fmla z25.h, z13.h, z2.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z22.h, z14.h, z1.h[1]\n"
- "fmla z26.h, z14.h, z2.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "fmla z23.h, z15.h, z1.h[1]\n"
- "fmla z27.h, z15.h, z2.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[2]\n"
- "fmla z24.h, z8.h, z2.h[2]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z21.h, z9.h, z1.h[2]\n"
- "fmla z25.h, z9.h, z2.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "fmla z22.h, z10.h, z1.h[2]\n"
- "fmla z26.h, z10.h, z2.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z23.h, z11.h, z1.h[2]\n"
- "fmla z27.h, z11.h, z2.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "fmla z20.h, z12.h, z1.h[3]\n"
- "fmla z24.h, z12.h, z2.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z21.h, z13.h, z1.h[3]\n"
- "fmla z25.h, z13.h, z2.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z22.h, z14.h, z1.h[3]\n"
- "fmla z26.h, z14.h, z2.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "fmla z23.h, z15.h, z1.h[3]\n"
- "fmla z27.h, z15.h, z2.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "fmla z20.h, z8.h, z1.h[4]\n"
- "fmla z24.h, z8.h, z2.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z21.h, z9.h, z1.h[4]\n"
- "fmla z25.h, z9.h, z2.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z22.h, z10.h, z1.h[4]\n"
- "fmla z26.h, z10.h, z2.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "fmla z23.h, z11.h, z1.h[4]\n"
- "fmla z27.h, z11.h, z2.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "fmla z20.h, z12.h, z1.h[5]\n"
- "fmla z24.h, z12.h, z2.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z21.h, z13.h, z1.h[5]\n"
- "fmla z25.h, z13.h, z2.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z22.h, z14.h, z1.h[5]\n"
- "fmla z26.h, z14.h, z2.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "fmla z23.h, z15.h, z1.h[5]\n"
- "fmla z27.h, z15.h, z2.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[6]\n"
- "fmla z24.h, z8.h, z2.h[6]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z21.h, z9.h, z1.h[6]\n"
- "fmla z25.h, z9.h, z2.h[6]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "fmla z22.h, z10.h, z1.h[6]\n"
- "fmla z26.h, z10.h, z2.h[6]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "fmla z23.h, z11.h, z1.h[6]\n"
- "fmla z27.h, z11.h, z2.h[6]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[7]\n"
- "fmla z20.h, z12.h, z1.h[7]\n"
- "fmla z24.h, z12.h, z2.h[7]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[7]\n"
- "fmla z21.h, z13.h, z1.h[7]\n"
- "fmla z25.h, z13.h, z2.h[7]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[7]\n"
- "fmla z22.h, z14.h, z1.h[7]\n"
- "fmla z26.h, z14.h, z2.h[7]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[7]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- "fmla z23.h, z15.h, z1.h[7]\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- "fmla z27.h, z15.h, z2.h[7]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[0]\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- "fmla z20.h, z8.h, z5.h[0]\n"
- "fmla z24.h, z8.h, z6.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z4.h[0]\n"
- "fmla z21.h, z9.h, z5.h[0]\n"
- "fmla z25.h, z9.h, z6.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[0]\n"
- "fmla z22.h, z10.h, z5.h[0]\n"
- "fmla z26.h, z10.h, z6.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[0]\n"
- "fmla z23.h, z11.h, z5.h[0]\n"
- "fmla z27.h, z11.h, z6.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[1]\n"
- "fmla z20.h, z12.h, z5.h[1]\n"
- "fmla z24.h, z12.h, z6.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[1]\n"
- "fmla z21.h, z13.h, z5.h[1]\n"
- "fmla z25.h, z13.h, z6.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[1]\n"
- "fmla z22.h, z14.h, z5.h[1]\n"
- "fmla z26.h, z14.h, z6.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[1]\n"
- "fmla z23.h, z15.h, z5.h[1]\n"
- "fmla z27.h, z15.h, z6.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z5.h[2]\n"
- "fmla z24.h, z8.h, z6.h[2]\n"
- "fmla z17.h, z9.h, z4.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z21.h, z9.h, z5.h[2]\n"
- "fmla z25.h, z9.h, z6.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[2]\n"
- "fmla z22.h, z10.h, z5.h[2]\n"
- "fmla z26.h, z10.h, z6.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[2]\n"
- "fmla z23.h, z11.h, z5.h[2]\n"
- "fmla z27.h, z11.h, z6.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[3]\n"
- "fmla z20.h, z12.h, z5.h[3]\n"
- "fmla z24.h, z12.h, z6.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[3]\n"
- "fmla z21.h, z13.h, z5.h[3]\n"
- "fmla z25.h, z13.h, z6.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[3]\n"
- "fmla z22.h, z14.h, z5.h[3]\n"
- "fmla z26.h, z14.h, z6.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[3]\n"
- "fmla z23.h, z15.h, z5.h[3]\n"
- "fmla z27.h, z15.h, z6.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[4]\n"
- "fmla z20.h, z8.h, z5.h[4]\n"
- "fmla z24.h, z8.h, z6.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z4.h[4]\n"
- "fmla z21.h, z9.h, z5.h[4]\n"
- "fmla z25.h, z9.h, z6.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[4]\n"
- "fmla z22.h, z10.h, z5.h[4]\n"
- "fmla z26.h, z10.h, z6.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[4]\n"
- "fmla z23.h, z11.h, z5.h[4]\n"
- "fmla z27.h, z11.h, z6.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[5]\n"
- "fmla z20.h, z12.h, z5.h[5]\n"
- "fmla z24.h, z12.h, z6.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[5]\n"
- "fmla z21.h, z13.h, z5.h[5]\n"
- "fmla z25.h, z13.h, z6.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[5]\n"
- "fmla z22.h, z14.h, z5.h[5]\n"
- "fmla z26.h, z14.h, z6.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[5]\n"
- "fmla z23.h, z15.h, z5.h[5]\n"
- "fmla z27.h, z15.h, z6.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z5.h[6]\n"
- "fmla z24.h, z8.h, z6.h[6]\n"
- "fmla z17.h, z9.h, z4.h[6]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z21.h, z9.h, z5.h[6]\n"
- "fmla z25.h, z9.h, z6.h[6]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[6]\n"
- "fmla z22.h, z10.h, z5.h[6]\n"
- "fmla z26.h, z10.h, z6.h[6]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[6]\n"
- "fmla z23.h, z11.h, z5.h[6]\n"
- "fmla z27.h, z11.h, z6.h[6]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[7]\n"
- "fmla z20.h, z12.h, z5.h[7]\n"
- "fmla z24.h, z12.h, z6.h[7]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[7]\n"
- "fmla z21.h, z13.h, z5.h[7]\n"
- "fmla z25.h, z13.h, z6.h[7]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[7]\n"
- "fmla z22.h, z14.h, z5.h[7]\n"
- "fmla z26.h, z14.h, z6.h[7]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[7]\n"
- "fmla z23.h, z15.h, z5.h[7]\n"
- "fmla z27.h, z15.h, z6.h[7]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "fmla z24.h, z8.h, z2.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- "fmla z21.h, z9.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z25.h, z9.h, z2.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "fmla z22.h, z10.h, z1.h[0]\n"
- "fmla z26.h, z10.h, z2.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "fmla z23.h, z11.h, z1.h[0]\n"
- "fmla z27.h, z11.h, z2.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "fmla z20.h, z12.h, z1.h[1]\n"
- "fmla z24.h, z12.h, z2.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "fmla z21.h, z13.h, z1.h[1]\n"
- "fmla z25.h, z13.h, z2.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z22.h, z14.h, z1.h[1]\n"
- "fmla z26.h, z14.h, z2.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "fmla z23.h, z15.h, z1.h[1]\n"
- "fmla z27.h, z15.h, z2.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[2]\n"
- "fmla z24.h, z8.h, z2.h[2]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z21.h, z9.h, z1.h[2]\n"
- "fmla z25.h, z9.h, z2.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "fmla z22.h, z10.h, z1.h[2]\n"
- "fmla z26.h, z10.h, z2.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z23.h, z11.h, z1.h[2]\n"
- "fmla z27.h, z11.h, z2.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "fmla z20.h, z12.h, z1.h[3]\n"
- "fmla z24.h, z12.h, z2.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z21.h, z13.h, z1.h[3]\n"
- "fmla z25.h, z13.h, z2.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z22.h, z14.h, z1.h[3]\n"
- "fmla z26.h, z14.h, z2.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "fmla z23.h, z15.h, z1.h[3]\n"
- "fmla z27.h, z15.h, z2.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "fmla z20.h, z8.h, z1.h[4]\n"
- "fmla z24.h, z8.h, z2.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z21.h, z9.h, z1.h[4]\n"
- "fmla z25.h, z9.h, z2.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z22.h, z10.h, z1.h[4]\n"
- "fmla z26.h, z10.h, z2.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "fmla z23.h, z11.h, z1.h[4]\n"
- "fmla z27.h, z11.h, z2.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "fmla z20.h, z12.h, z1.h[5]\n"
- "fmla z24.h, z12.h, z2.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z21.h, z13.h, z1.h[5]\n"
- "fmla z25.h, z13.h, z2.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z22.h, z14.h, z1.h[5]\n"
- "fmla z26.h, z14.h, z2.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "fmla z23.h, z15.h, z1.h[5]\n"
- "fmla z27.h, z15.h, z2.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[6]\n"
- "fmla z24.h, z8.h, z2.h[6]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z21.h, z9.h, z1.h[6]\n"
- "fmla z25.h, z9.h, z2.h[6]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "fmla z22.h, z10.h, z1.h[6]\n"
- "fmla z26.h, z10.h, z2.h[6]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "fmla z23.h, z11.h, z1.h[6]\n"
- "fmla z27.h, z11.h, z2.h[6]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[7]\n"
- "fmla z20.h, z12.h, z1.h[7]\n"
- "fmla z24.h, z12.h, z2.h[7]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[7]\n"
- "fmla z21.h, z13.h, z1.h[7]\n"
- "fmla z25.h, z13.h, z2.h[7]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[7]\n"
- "fmla z22.h, z14.h, z1.h[7]\n"
- "fmla z26.h, z14.h, z2.h[7]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[7]\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z23.h, z15.h, z1.h[7]\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- "fmla z27.h, z15.h, z2.h[7]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[0]\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- "fmla z20.h, z8.h, z5.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "fmla z24.h, z8.h, z6.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z4.h[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "fmla z21.h, z9.h, z5.h[0]\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- "fmla z25.h, z9.h, z6.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[0]\n"
- "fmla z22.h, z10.h, z5.h[0]\n"
- "fmla z26.h, z10.h, z6.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[0]\n"
- "fmla z23.h, z11.h, z5.h[0]\n"
- "fmla z27.h, z11.h, z6.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[1]\n"
- "fmla z20.h, z12.h, z5.h[1]\n"
- "fmla z24.h, z12.h, z6.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[1]\n"
- "fmla z21.h, z13.h, z5.h[1]\n"
- "fmla z25.h, z13.h, z6.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[1]\n"
- "fmla z22.h, z14.h, z5.h[1]\n"
- "fmla z26.h, z14.h, z6.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[1]\n"
- "fmla z23.h, z15.h, z5.h[1]\n"
- "fmla z27.h, z15.h, z6.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z5.h[2]\n"
- "fmla z24.h, z8.h, z6.h[2]\n"
- "fmla z17.h, z9.h, z4.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z21.h, z9.h, z5.h[2]\n"
- "fmla z25.h, z9.h, z6.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[2]\n"
- "fmla z22.h, z10.h, z5.h[2]\n"
- "fmla z26.h, z10.h, z6.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[2]\n"
- "fmla z23.h, z11.h, z5.h[2]\n"
- "fmla z27.h, z11.h, z6.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[3]\n"
- "fmla z20.h, z12.h, z5.h[3]\n"
- "fmla z24.h, z12.h, z6.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[3]\n"
- "fmla z21.h, z13.h, z5.h[3]\n"
- "fmla z25.h, z13.h, z6.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[3]\n"
- "fmla z22.h, z14.h, z5.h[3]\n"
- "fmla z26.h, z14.h, z6.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[3]\n"
- "fmla z23.h, z15.h, z5.h[3]\n"
- "fmla z27.h, z15.h, z6.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[4]\n"
- "fmla z20.h, z8.h, z5.h[4]\n"
- "fmla z24.h, z8.h, z6.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z4.h[4]\n"
- "fmla z21.h, z9.h, z5.h[4]\n"
- "fmla z25.h, z9.h, z6.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[4]\n"
- "fmla z22.h, z10.h, z5.h[4]\n"
- "fmla z26.h, z10.h, z6.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[4]\n"
- "fmla z23.h, z11.h, z5.h[4]\n"
- "fmla z27.h, z11.h, z6.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[5]\n"
- "fmla z20.h, z12.h, z5.h[5]\n"
- "fmla z24.h, z12.h, z6.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[5]\n"
- "fmla z21.h, z13.h, z5.h[5]\n"
- "fmla z25.h, z13.h, z6.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[5]\n"
- "fmla z22.h, z14.h, z5.h[5]\n"
- "fmla z26.h, z14.h, z6.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[5]\n"
- "fmla z23.h, z15.h, z5.h[5]\n"
- "fmla z27.h, z15.h, z6.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z20.h, z8.h, z5.h[6]\n"
- "fmla z24.h, z8.h, z6.h[6]\n"
- "fmla z17.h, z9.h, z4.h[6]\n"
- "fmla z21.h, z9.h, z5.h[6]\n"
- "fmla z25.h, z9.h, z6.h[6]\n"
- "fmla z18.h, z10.h, z4.h[6]\n"
- "fmla z22.h, z10.h, z5.h[6]\n"
- "fmla z26.h, z10.h, z6.h[6]\n"
- "fmla z19.h, z11.h, z4.h[6]\n"
- "fmla z23.h, z11.h, z5.h[6]\n"
- "fmla z27.h, z11.h, z6.h[6]\n"
- "fmla z16.h, z12.h, z4.h[7]\n"
- "fmla z20.h, z12.h, z5.h[7]\n"
- "fmla z24.h, z12.h, z6.h[7]\n"
- "fmla z17.h, z13.h, z4.h[7]\n"
- "fmla z21.h, z13.h, z5.h[7]\n"
- "fmla z25.h, z13.h, z6.h[7]\n"
- "fmla z18.h, z14.h, z4.h[7]\n"
- "fmla z22.h, z14.h, z5.h[7]\n"
- "fmla z26.h, z14.h, z6.h[7]\n"
- "fmla z19.h, z15.h, z4.h[7]\n"
- "fmla z23.h, z15.h, z5.h[7]\n"
- "fmla z27.h, z15.h, z6.h[7]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[0]\n"
- "fmla z24.h, z8.h, z2.h[0]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "fmla z21.h, z9.h, z1.h[0]\n"
- "fmla z25.h, z9.h, z2.h[0]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "fmla z22.h, z10.h, z1.h[0]\n"
- "fmla z26.h, z10.h, z2.h[0]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "fmla z23.h, z11.h, z1.h[0]\n"
- "fmla z27.h, z11.h, z2.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.h, z12.h, z1.h[1]\n"
- "fmla z24.h, z12.h, z2.h[1]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "fmla z21.h, z13.h, z1.h[1]\n"
- "fmla z25.h, z13.h, z2.h[1]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z22.h, z14.h, z1.h[1]\n"
- "fmla z26.h, z14.h, z2.h[1]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "fmla z23.h, z15.h, z1.h[1]\n"
- "fmla z27.h, z15.h, z2.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "fmla z20.h, z8.h, z1.h[2]\n"
- "fmla z24.h, z8.h, z2.h[2]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "fmla z21.h, z9.h, z1.h[2]\n"
- "fmla z25.h, z9.h, z2.h[2]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "fmla z22.h, z10.h, z1.h[2]\n"
- "fmla z26.h, z10.h, z2.h[2]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z23.h, z11.h, z1.h[2]\n"
- "fmla z27.h, z11.h, z2.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z12.h, z1.h[3]\n"
- "fmla z24.h, z12.h, z2.h[3]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z21.h, z13.h, z1.h[3]\n"
- "fmla z25.h, z13.h, z2.h[3]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z22.h, z14.h, z1.h[3]\n"
- "fmla z26.h, z14.h, z2.h[3]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "fmla z23.h, z15.h, z1.h[3]\n"
- "fmla z27.h, z15.h, z2.h[3]\n"
- "b.eq 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[4]\n"
- "fmla z24.h, z8.h, z2.h[4]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z21.h, z9.h, z1.h[4]\n"
- "fmla z25.h, z9.h, z2.h[4]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z22.h, z10.h, z1.h[4]\n"
- "fmla z26.h, z10.h, z2.h[4]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "fmla z23.h, z11.h, z1.h[4]\n"
- "fmla z27.h, z11.h, z2.h[4]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.h, z12.h, z1.h[5]\n"
- "fmla z24.h, z12.h, z2.h[5]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z21.h, z13.h, z1.h[5]\n"
- "fmla z25.h, z13.h, z2.h[5]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z22.h, z14.h, z1.h[5]\n"
- "fmla z26.h, z14.h, z2.h[5]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "fmla z23.h, z15.h, z1.h[5]\n"
- "fmla z27.h, z15.h, z2.h[5]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "fmla z20.h, z8.h, z1.h[6]\n"
- "fmla z24.h, z8.h, z2.h[6]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "fmla z21.h, z9.h, z1.h[6]\n"
- "fmla z25.h, z9.h, z2.h[6]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "fmla z22.h, z10.h, z1.h[6]\n"
- "fmla z26.h, z10.h, z2.h[6]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "fmla z23.h, z11.h, z1.h[6]\n"
- "fmla z27.h, z11.h, z2.h[6]\n"
- "b 5f\n"
- "4:\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- "fmla z24.h, z8.h, z2.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- "fmla z21.h, z9.h, z1.h[0]\n"
- "ld1rqh z6.h, p6/z, [a_ptr2]\n"
- "fmla z25.h, z9.h, z2.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "fmla z22.h, z10.h, z1.h[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "fmla z26.h, z10.h, z2.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- "fmla z23.h, z11.h, z1.h[0]\n"
- "fmla z27.h, z11.h, z2.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "fmla z20.h, z12.h, z1.h[1]\n"
- "fmla z24.h, z12.h, z2.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "fmla z21.h, z13.h, z1.h[1]\n"
- "fmla z25.h, z13.h, z2.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z22.h, z14.h, z1.h[1]\n"
- "fmla z26.h, z14.h, z2.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "fmla z23.h, z15.h, z1.h[1]\n"
- "fmla z27.h, z15.h, z2.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[2]\n"
- "fmla z24.h, z8.h, z2.h[2]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z21.h, z9.h, z1.h[2]\n"
- "fmla z25.h, z9.h, z2.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "fmla z22.h, z10.h, z1.h[2]\n"
- "fmla z26.h, z10.h, z2.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z23.h, z11.h, z1.h[2]\n"
- "fmla z27.h, z11.h, z2.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "fmla z20.h, z12.h, z1.h[3]\n"
- "fmla z24.h, z12.h, z2.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z21.h, z13.h, z1.h[3]\n"
- "fmla z25.h, z13.h, z2.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z22.h, z14.h, z1.h[3]\n"
- "fmla z26.h, z14.h, z2.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "fmla z23.h, z15.h, z1.h[3]\n"
- "fmla z27.h, z15.h, z2.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "fmla z20.h, z8.h, z1.h[4]\n"
- "fmla z24.h, z8.h, z2.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z21.h, z9.h, z1.h[4]\n"
- "fmla z25.h, z9.h, z2.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z22.h, z10.h, z1.h[4]\n"
- "fmla z26.h, z10.h, z2.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "fmla z23.h, z11.h, z1.h[4]\n"
- "fmla z27.h, z11.h, z2.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "fmla z20.h, z12.h, z1.h[5]\n"
- "fmla z24.h, z12.h, z2.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z21.h, z13.h, z1.h[5]\n"
- "fmla z25.h, z13.h, z2.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z22.h, z14.h, z1.h[5]\n"
- "fmla z26.h, z14.h, z2.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "fmla z23.h, z15.h, z1.h[5]\n"
- "fmla z27.h, z15.h, z2.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z20.h, z8.h, z1.h[6]\n"
- "fmla z24.h, z8.h, z2.h[6]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "fmla z21.h, z9.h, z1.h[6]\n"
- "fmla z25.h, z9.h, z2.h[6]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "fmla z22.h, z10.h, z1.h[6]\n"
- "fmla z26.h, z10.h, z2.h[6]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "fmla z23.h, z11.h, z1.h[6]\n"
- "fmla z27.h, z11.h, z2.h[6]\n"
- "fmla z16.h, z12.h, z0.h[7]\n"
- "fmla z20.h, z12.h, z1.h[7]\n"
- "fmla z24.h, z12.h, z2.h[7]\n"
- "fmla z17.h, z13.h, z0.h[7]\n"
- "fmla z21.h, z13.h, z1.h[7]\n"
- "fmla z25.h, z13.h, z2.h[7]\n"
- "fmla z18.h, z14.h, z0.h[7]\n"
- "fmla z22.h, z14.h, z1.h[7]\n"
- "fmla z26.h, z14.h, z2.h[7]\n"
- "fmla z19.h, z15.h, z0.h[7]\n"
- "fmla z23.h, z15.h, z1.h[7]\n"
- "fmla z27.h, z15.h, z2.h[7]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.h, z8.h, z5.h[0]\n"
- "fmla z24.h, z8.h, z6.h[0]\n"
- "fmla z17.h, z9.h, z4.h[0]\n"
- "fmla z21.h, z9.h, z5.h[0]\n"
- "fmla z25.h, z9.h, z6.h[0]\n"
- "fmla z18.h, z10.h, z4.h[0]\n"
- "fmla z22.h, z10.h, z5.h[0]\n"
- "fmla z26.h, z10.h, z6.h[0]\n"
- "fmla z19.h, z11.h, z4.h[0]\n"
- "fmla z23.h, z11.h, z5.h[0]\n"
- "fmla z27.h, z11.h, z6.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.h, z12.h, z5.h[1]\n"
- "fmla z24.h, z12.h, z6.h[1]\n"
- "fmla z17.h, z13.h, z4.h[1]\n"
- "fmla z21.h, z13.h, z5.h[1]\n"
- "fmla z25.h, z13.h, z6.h[1]\n"
- "fmla z18.h, z14.h, z4.h[1]\n"
- "fmla z22.h, z14.h, z5.h[1]\n"
- "fmla z26.h, z14.h, z6.h[1]\n"
- "fmla z19.h, z15.h, z4.h[1]\n"
- "fmla z23.h, z15.h, z5.h[1]\n"
- "fmla z27.h, z15.h, z6.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[2]\n"
- "fmla z20.h, z8.h, z5.h[2]\n"
- "fmla z24.h, z8.h, z6.h[2]\n"
- "fmla z17.h, z9.h, z4.h[2]\n"
- "fmla z21.h, z9.h, z5.h[2]\n"
- "fmla z25.h, z9.h, z6.h[2]\n"
- "fmla z18.h, z10.h, z4.h[2]\n"
- "fmla z22.h, z10.h, z5.h[2]\n"
- "fmla z26.h, z10.h, z6.h[2]\n"
- "fmla z19.h, z11.h, z4.h[2]\n"
- "fmla z23.h, z11.h, z5.h[2]\n"
- "fmla z27.h, z11.h, z6.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z12.h, z5.h[3]\n"
- "fmla z24.h, z12.h, z6.h[3]\n"
- "fmla z17.h, z13.h, z4.h[3]\n"
- "fmla z21.h, z13.h, z5.h[3]\n"
- "fmla z25.h, z13.h, z6.h[3]\n"
- "fmla z18.h, z14.h, z4.h[3]\n"
- "fmla z22.h, z14.h, z5.h[3]\n"
- "fmla z26.h, z14.h, z6.h[3]\n"
- "fmla z19.h, z15.h, z4.h[3]\n"
- "fmla z23.h, z15.h, z5.h[3]\n"
- "fmla z27.h, z15.h, z6.h[3]\n"
- "b.eq 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.h, z8.h, z5.h[4]\n"
- "fmla z24.h, z8.h, z6.h[4]\n"
- "fmla z17.h, z9.h, z4.h[4]\n"
- "fmla z21.h, z9.h, z5.h[4]\n"
- "fmla z25.h, z9.h, z6.h[4]\n"
- "fmla z18.h, z10.h, z4.h[4]\n"
- "fmla z22.h, z10.h, z5.h[4]\n"
- "fmla z26.h, z10.h, z6.h[4]\n"
- "fmla z19.h, z11.h, z4.h[4]\n"
- "fmla z23.h, z11.h, z5.h[4]\n"
- "fmla z27.h, z11.h, z6.h[4]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.h, z12.h, z5.h[5]\n"
- "fmla z24.h, z12.h, z6.h[5]\n"
- "fmla z17.h, z13.h, z4.h[5]\n"
- "fmla z21.h, z13.h, z5.h[5]\n"
- "fmla z25.h, z13.h, z6.h[5]\n"
- "fmla z18.h, z14.h, z4.h[5]\n"
- "fmla z22.h, z14.h, z5.h[5]\n"
- "fmla z26.h, z14.h, z6.h[5]\n"
- "fmla z19.h, z15.h, z4.h[5]\n"
- "fmla z23.h, z15.h, z5.h[5]\n"
- "fmla z27.h, z15.h, z6.h[5]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[6]\n"
- "fmla z20.h, z8.h, z5.h[6]\n"
- "fmla z24.h, z8.h, z6.h[6]\n"
- "fmla z17.h, z9.h, z4.h[6]\n"
- "fmla z21.h, z9.h, z5.h[6]\n"
- "fmla z25.h, z9.h, z6.h[6]\n"
- "fmla z18.h, z10.h, z4.h[6]\n"
- "fmla z22.h, z10.h, z5.h[6]\n"
- "fmla z26.h, z10.h, z6.h[6]\n"
- "fmla z19.h, z11.h, z4.h[6]\n"
- "fmla z23.h, z11.h, z5.h[6]\n"
- "fmla z27.h, z11.h, z6.h[6]\n"
- "5:\n"
- "ld1rh z14.h, p7/z, [%[minptr]]\n"
- "ld1rh z15.h, p7/z, [%[maxptr]]\n"
- "fmax z16.h, p7/m, z16.h, z14.h\n"
- "fmax z17.h, p7/m, z17.h, z14.h\n"
- "fmax z18.h, p7/m, z18.h, z14.h\n"
- "fmax z19.h, p7/m, z19.h, z14.h\n"
- "fmin z16.h, p7/m, z16.h, z15.h\n"
- "fmin z17.h, p7/m, z17.h, z15.h\n"
- "fmin z18.h, p7/m, z18.h, z15.h\n"
- "fmin z19.h, p7/m, z19.h, z15.h\n"
- "st1h z16.h, p0, [%[c_ptr0]]\n"
- "fmax z20.h, p7/m, z20.h, z14.h\n"
- "fmax z21.h, p7/m, z21.h, z14.h\n"
- "fmax z22.h, p7/m, z22.h, z14.h\n"
- "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmax z23.h, p7/m, z23.h, z14.h\n"
- "fmin z20.h, p7/m, z20.h, z15.h\n"
- "fmin z21.h, p7/m, z21.h, z15.h\n"
- "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmin z22.h, p7/m, z22.h, z15.h\n"
- "fmin z23.h, p7/m, z23.h, z15.h\n"
- "fmax z24.h, p7/m, z24.h, z14.h\n"
- "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
- "fmax z25.h, p7/m, z25.h, z14.h\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "fmax z26.h, p7/m, z26.h, z14.h\n"
- "st1h z20.h, p0, [c_ptr1]\n"
- "fmin z24.h, p7/m, z24.h, z15.h\n"
- "fmin z25.h, p7/m, z25.h, z15.h\n"
- "fmax z27.h, p7/m, z27.h, z14.h\n"
- "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
- "fmin z26.h, p7/m, z26.h, z15.h\n"
- "fmin z27.h, p7/m, z27.h, z15.h\n"
- "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
- "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
- "st1h z24.h, p0, [c_ptr2]\n"
- "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n"
- "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n"
- "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "whilelt p6.h, %[temp], %[leftovers]\n"
- "whilelt p0.h, %[temp], %[width]\n"
- "inch %[temp], all, mul #1\n"
- "ptrue p7.h\n"
- "whilelt p1.h, %[temp], %[width]\n"
- "inch %[temp], all, mul #1\n"
- "whilelt p2.h, %[temp], %[width]\n"
- "inch %[temp], all, mul #1\n"
- "whilelt p3.h, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1h z16.h, p0/z, [%[biasptr]]\n"
- "ld1h z17.h, p1/z, [%[biasptr], #1, MUL VL]\n"
- "ld1h z18.h, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1h z19.h, p3/z, [%[biasptr], #3, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "mov z21.d, z17.d\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "mov z22.d, z18.d\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "mov z23.d, z19.d\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "mov z24.d, z16.d\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "mov z25.d, z17.d\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z26.d, z18.d\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z27.d, z19.d\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z28.d, z16.d\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z29.d, z17.d\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z30.d, z18.d\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "mov z31.d, z19.d\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1h z16.h, p0/z, [%[c_ptr0]]\n"
- "ld1h z17.h, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1h z18.h, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1h z19.h, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1h z20.h, p0/z, [c_ptr1]\n"
- "ld1h z21.h, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1h z22.h, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1h z23.h, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1h z24.h, p0/z, [c_ptr2]\n"
- "ld1h z25.h, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1h z26.h, p2/z, [c_ptr2, #2, MUL VL]\n"
- "ld1h z27.h, p3/z, [c_ptr2, #3, MUL VL]\n"
- "ld1h z28.h, p0/z, [c_ptr3]\n"
- "ld1h z29.h, p1/z, [c_ptr3, #1, MUL VL]\n"
- "ld1h z30.h, p2/z, [c_ptr3, #2, MUL VL]\n"
- "ld1h z31.h, p3/z, [c_ptr3, #3, MUL VL]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqh z1.h, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1rqh z2.h, p7/z, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqh z3.h, p7/z, [a_ptr3]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "fmla z24.h, z8.h, z2.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- "fmla z28.h, z8.h, z3.h[0]\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- "fmla z21.h, z9.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z25.h, z9.h, z2.h[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla z29.h, z9.h, z3.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla z22.h, z10.h, z1.h[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla z26.h, z10.h, z2.h[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla z30.h, z10.h, z3.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- "fmla z23.h, z11.h, z1.h[0]\n"
- "fmla z27.h, z11.h, z2.h[0]\n"
- "fmla z31.h, z11.h, z3.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "fmla z20.h, z12.h, z1.h[1]\n"
- "fmla z24.h, z12.h, z2.h[1]\n"
- "fmla z28.h, z12.h, z3.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "fmla z21.h, z13.h, z1.h[1]\n"
- "fmla z25.h, z13.h, z2.h[1]\n"
- "fmla z29.h, z13.h, z3.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z22.h, z14.h, z1.h[1]\n"
- "fmla z26.h, z14.h, z2.h[1]\n"
- "fmla z30.h, z14.h, z3.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "fmla z23.h, z15.h, z1.h[1]\n"
- "fmla z27.h, z15.h, z2.h[1]\n"
- "fmla z31.h, z15.h, z3.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[2]\n"
- "fmla z24.h, z8.h, z2.h[2]\n"
- "fmla z28.h, z8.h, z3.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "fmla z21.h, z9.h, z1.h[2]\n"
- "fmla z25.h, z9.h, z2.h[2]\n"
- "fmla z29.h, z9.h, z3.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "fmla z22.h, z10.h, z1.h[2]\n"
- "fmla z26.h, z10.h, z2.h[2]\n"
- "fmla z30.h, z10.h, z3.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z23.h, z11.h, z1.h[2]\n"
- "fmla z27.h, z11.h, z2.h[2]\n"
- "fmla z31.h, z11.h, z3.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "fmla z20.h, z12.h, z1.h[3]\n"
- "fmla z24.h, z12.h, z2.h[3]\n"
- "fmla z28.h, z12.h, z3.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z21.h, z13.h, z1.h[3]\n"
- "fmla z25.h, z13.h, z2.h[3]\n"
- "fmla z29.h, z13.h, z3.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z22.h, z14.h, z1.h[3]\n"
- "fmla z26.h, z14.h, z2.h[3]\n"
- "fmla z30.h, z14.h, z3.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "fmla z23.h, z15.h, z1.h[3]\n"
- "fmla z27.h, z15.h, z2.h[3]\n"
- "fmla z31.h, z15.h, z3.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "fmla z20.h, z8.h, z1.h[4]\n"
- "fmla z24.h, z8.h, z2.h[4]\n"
- "fmla z28.h, z8.h, z3.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z21.h, z9.h, z1.h[4]\n"
- "fmla z25.h, z9.h, z2.h[4]\n"
- "fmla z29.h, z9.h, z3.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z22.h, z10.h, z1.h[4]\n"
- "fmla z26.h, z10.h, z2.h[4]\n"
- "fmla z30.h, z10.h, z3.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "fmla z23.h, z11.h, z1.h[4]\n"
- "fmla z27.h, z11.h, z2.h[4]\n"
- "fmla z31.h, z11.h, z3.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "fmla z20.h, z12.h, z1.h[5]\n"
- "fmla z24.h, z12.h, z2.h[5]\n"
- "fmla z28.h, z12.h, z3.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z21.h, z13.h, z1.h[5]\n"
- "fmla z25.h, z13.h, z2.h[5]\n"
- "fmla z29.h, z13.h, z3.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z22.h, z14.h, z1.h[5]\n"
- "fmla z26.h, z14.h, z2.h[5]\n"
- "fmla z30.h, z14.h, z3.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "fmla z23.h, z15.h, z1.h[5]\n"
- "fmla z27.h, z15.h, z2.h[5]\n"
- "fmla z31.h, z15.h, z3.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[6]\n"
- "fmla z24.h, z8.h, z2.h[6]\n"
- "fmla z28.h, z8.h, z3.h[6]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "fmla z21.h, z9.h, z1.h[6]\n"
- "fmla z25.h, z9.h, z2.h[6]\n"
- "fmla z29.h, z9.h, z3.h[6]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "fmla z22.h, z10.h, z1.h[6]\n"
- "fmla z26.h, z10.h, z2.h[6]\n"
- "fmla z30.h, z10.h, z3.h[6]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "fmla z23.h, z11.h, z1.h[6]\n"
- "fmla z27.h, z11.h, z2.h[6]\n"
- "fmla z31.h, z11.h, z3.h[6]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[7]\n"
- "fmla z20.h, z12.h, z1.h[7]\n"
- "fmla z24.h, z12.h, z2.h[7]\n"
- "fmla z28.h, z12.h, z3.h[7]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[7]\n"
- "fmla z21.h, z13.h, z1.h[7]\n"
- "fmla z25.h, z13.h, z2.h[7]\n"
- "fmla z29.h, z13.h, z3.h[7]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[7]\n"
- "fmla z22.h, z14.h, z1.h[7]\n"
- "fmla z26.h, z14.h, z2.h[7]\n"
- "fmla z30.h, z14.h, z3.h[7]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[7]\n"
- "ld1rqh z0.h, p7/z, [%[a_ptr0], #-0x10]\n"
- "fmla z23.h, z15.h, z1.h[7]\n"
- "ld1rqh z1.h, p7/z, [a_ptr1, #-0x10]\n"
- "fmla z27.h, z15.h, z2.h[7]\n"
- "ld1rqh z2.h, p7/z, [a_ptr2, #-0x10]\n"
- "fmla z31.h, z15.h, z3.h[7]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[0]\n"
- "ld1rqh z3.h, p7/z, [a_ptr3, #-0x10]\n"
- "fmla z20.h, z8.h, z5.h[0]\n"
- "fmla z24.h, z8.h, z6.h[0]\n"
- "fmla z28.h, z8.h, z7.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z4.h[0]\n"
- "fmla z21.h, z9.h, z5.h[0]\n"
- "fmla z25.h, z9.h, z6.h[0]\n"
- "fmla z29.h, z9.h, z7.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[0]\n"
- "fmla z22.h, z10.h, z5.h[0]\n"
- "fmla z26.h, z10.h, z6.h[0]\n"
- "fmla z30.h, z10.h, z7.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[0]\n"
- "fmla z23.h, z11.h, z5.h[0]\n"
- "fmla z27.h, z11.h, z6.h[0]\n"
- "fmla z31.h, z11.h, z7.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[1]\n"
- "fmla z20.h, z12.h, z5.h[1]\n"
- "fmla z24.h, z12.h, z6.h[1]\n"
- "fmla z28.h, z12.h, z7.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[1]\n"
- "fmla z21.h, z13.h, z5.h[1]\n"
- "fmla z25.h, z13.h, z6.h[1]\n"
- "fmla z29.h, z13.h, z7.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[1]\n"
- "fmla z22.h, z14.h, z5.h[1]\n"
- "fmla z26.h, z14.h, z6.h[1]\n"
- "fmla z30.h, z14.h, z7.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[1]\n"
- "fmla z23.h, z15.h, z5.h[1]\n"
- "fmla z27.h, z15.h, z6.h[1]\n"
- "fmla z31.h, z15.h, z7.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z5.h[2]\n"
- "fmla z24.h, z8.h, z6.h[2]\n"
- "fmla z28.h, z8.h, z7.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[2]\n"
- "fmla z21.h, z9.h, z5.h[2]\n"
- "fmla z25.h, z9.h, z6.h[2]\n"
- "fmla z29.h, z9.h, z7.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[2]\n"
- "fmla z22.h, z10.h, z5.h[2]\n"
- "fmla z26.h, z10.h, z6.h[2]\n"
- "fmla z30.h, z10.h, z7.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[2]\n"
- "fmla z23.h, z11.h, z5.h[2]\n"
- "fmla z27.h, z11.h, z6.h[2]\n"
- "fmla z31.h, z11.h, z7.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[3]\n"
- "fmla z20.h, z12.h, z5.h[3]\n"
- "fmla z24.h, z12.h, z6.h[3]\n"
- "fmla z28.h, z12.h, z7.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[3]\n"
- "fmla z21.h, z13.h, z5.h[3]\n"
- "fmla z25.h, z13.h, z6.h[3]\n"
- "fmla z29.h, z13.h, z7.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[3]\n"
- "fmla z22.h, z14.h, z5.h[3]\n"
- "fmla z26.h, z14.h, z6.h[3]\n"
- "fmla z30.h, z14.h, z7.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[3]\n"
- "fmla z23.h, z15.h, z5.h[3]\n"
- "fmla z27.h, z15.h, z6.h[3]\n"
- "fmla z31.h, z15.h, z7.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[4]\n"
- "fmla z20.h, z8.h, z5.h[4]\n"
- "fmla z24.h, z8.h, z6.h[4]\n"
- "fmla z28.h, z8.h, z7.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z4.h[4]\n"
- "fmla z21.h, z9.h, z5.h[4]\n"
- "fmla z25.h, z9.h, z6.h[4]\n"
- "fmla z29.h, z9.h, z7.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[4]\n"
- "fmla z22.h, z10.h, z5.h[4]\n"
- "fmla z26.h, z10.h, z6.h[4]\n"
- "fmla z30.h, z10.h, z7.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[4]\n"
- "fmla z23.h, z11.h, z5.h[4]\n"
- "fmla z27.h, z11.h, z6.h[4]\n"
- "fmla z31.h, z11.h, z7.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[5]\n"
- "fmla z20.h, z12.h, z5.h[5]\n"
- "fmla z24.h, z12.h, z6.h[5]\n"
- "fmla z28.h, z12.h, z7.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[5]\n"
- "fmla z21.h, z13.h, z5.h[5]\n"
- "fmla z25.h, z13.h, z6.h[5]\n"
- "fmla z29.h, z13.h, z7.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[5]\n"
- "fmla z22.h, z14.h, z5.h[5]\n"
- "fmla z26.h, z14.h, z6.h[5]\n"
- "fmla z30.h, z14.h, z7.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[5]\n"
- "fmla z23.h, z15.h, z5.h[5]\n"
- "fmla z27.h, z15.h, z6.h[5]\n"
- "fmla z31.h, z15.h, z7.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z5.h[6]\n"
- "fmla z24.h, z8.h, z6.h[6]\n"
- "fmla z28.h, z8.h, z7.h[6]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[6]\n"
- "fmla z21.h, z9.h, z5.h[6]\n"
- "fmla z25.h, z9.h, z6.h[6]\n"
- "fmla z29.h, z9.h, z7.h[6]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[6]\n"
- "fmla z22.h, z10.h, z5.h[6]\n"
- "fmla z26.h, z10.h, z6.h[6]\n"
- "fmla z30.h, z10.h, z7.h[6]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[6]\n"
- "fmla z23.h, z11.h, z5.h[6]\n"
- "fmla z27.h, z11.h, z6.h[6]\n"
- "fmla z31.h, z11.h, z7.h[6]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[7]\n"
- "fmla z20.h, z12.h, z5.h[7]\n"
- "fmla z24.h, z12.h, z6.h[7]\n"
- "fmla z28.h, z12.h, z7.h[7]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[7]\n"
- "fmla z21.h, z13.h, z5.h[7]\n"
- "fmla z25.h, z13.h, z6.h[7]\n"
- "fmla z29.h, z13.h, z7.h[7]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[7]\n"
- "fmla z22.h, z14.h, z5.h[7]\n"
- "fmla z26.h, z14.h, z6.h[7]\n"
- "fmla z30.h, z14.h, z7.h[7]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[7]\n"
- "fmla z23.h, z15.h, z5.h[7]\n"
- "fmla z27.h, z15.h, z6.h[7]\n"
- "fmla z31.h, z15.h, z7.h[7]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p7/z, [%[a_ptr0]]\n"
- "fmla z24.h, z8.h, z2.h[0]\n"
- "ld1rqh z5.h, p7/z, [a_ptr1]\n"
- "fmla z28.h, z8.h, z3.h[0]\n"
- "ld1rqh z6.h, p7/z, [a_ptr2]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "ld1rqh z7.h, p7/z, [a_ptr3]\n"
- "fmla z21.h, z9.h, z1.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z25.h, z9.h, z2.h[0]\n"
- "fmla z29.h, z9.h, z3.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "fmla z22.h, z10.h, z1.h[0]\n"
- "fmla z26.h, z10.h, z2.h[0]\n"
- "fmla z30.h, z10.h, z3.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "fmla z23.h, z11.h, z1.h[0]\n"
- "fmla z27.h, z11.h, z2.h[0]\n"
- "fmla z31.h, z11.h, z3.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "fmla z20.h, z12.h, z1.h[1]\n"
- "fmla z24.h, z12.h, z2.h[1]\n"
- "fmla z28.h, z12.h, z3.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "fmla z21.h, z13.h, z1.h[1]\n"
- "fmla z25.h, z13.h, z2.h[1]\n"
- "fmla z29.h, z13.h, z3.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z22.h, z14.h, z1.h[1]\n"
- "fmla z26.h, z14.h, z2.h[1]\n"
- "fmla z30.h, z14.h, z3.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "fmla z23.h, z15.h, z1.h[1]\n"
- "fmla z27.h, z15.h, z2.h[1]\n"
- "fmla z31.h, z15.h, z3.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[2]\n"
- "fmla z24.h, z8.h, z2.h[2]\n"
- "fmla z28.h, z8.h, z3.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "fmla z21.h, z9.h, z1.h[2]\n"
- "fmla z25.h, z9.h, z2.h[2]\n"
- "fmla z29.h, z9.h, z3.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "fmla z22.h, z10.h, z1.h[2]\n"
- "fmla z26.h, z10.h, z2.h[2]\n"
- "fmla z30.h, z10.h, z3.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z23.h, z11.h, z1.h[2]\n"
- "fmla z27.h, z11.h, z2.h[2]\n"
- "fmla z31.h, z11.h, z3.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "fmla z20.h, z12.h, z1.h[3]\n"
- "fmla z24.h, z12.h, z2.h[3]\n"
- "fmla z28.h, z12.h, z3.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z21.h, z13.h, z1.h[3]\n"
- "fmla z25.h, z13.h, z2.h[3]\n"
- "fmla z29.h, z13.h, z3.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z22.h, z14.h, z1.h[3]\n"
- "fmla z26.h, z14.h, z2.h[3]\n"
- "fmla z30.h, z14.h, z3.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "fmla z23.h, z15.h, z1.h[3]\n"
- "fmla z27.h, z15.h, z2.h[3]\n"
- "fmla z31.h, z15.h, z3.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "fmla z20.h, z8.h, z1.h[4]\n"
- "fmla z24.h, z8.h, z2.h[4]\n"
- "fmla z28.h, z8.h, z3.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z21.h, z9.h, z1.h[4]\n"
- "fmla z25.h, z9.h, z2.h[4]\n"
- "fmla z29.h, z9.h, z3.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z22.h, z10.h, z1.h[4]\n"
- "fmla z26.h, z10.h, z2.h[4]\n"
- "fmla z30.h, z10.h, z3.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "fmla z23.h, z11.h, z1.h[4]\n"
- "fmla z27.h, z11.h, z2.h[4]\n"
- "fmla z31.h, z11.h, z3.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "fmla z20.h, z12.h, z1.h[5]\n"
- "fmla z24.h, z12.h, z2.h[5]\n"
- "fmla z28.h, z12.h, z3.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z21.h, z13.h, z1.h[5]\n"
- "fmla z25.h, z13.h, z2.h[5]\n"
- "fmla z29.h, z13.h, z3.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z22.h, z14.h, z1.h[5]\n"
- "fmla z26.h, z14.h, z2.h[5]\n"
- "fmla z30.h, z14.h, z3.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "fmla z23.h, z15.h, z1.h[5]\n"
- "fmla z27.h, z15.h, z2.h[5]\n"
- "fmla z31.h, z15.h, z3.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[6]\n"
- "fmla z24.h, z8.h, z2.h[6]\n"
- "fmla z28.h, z8.h, z3.h[6]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "fmla z21.h, z9.h, z1.h[6]\n"
- "fmla z25.h, z9.h, z2.h[6]\n"
- "fmla z29.h, z9.h, z3.h[6]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "fmla z22.h, z10.h, z1.h[6]\n"
- "fmla z26.h, z10.h, z2.h[6]\n"
- "fmla z30.h, z10.h, z3.h[6]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "fmla z23.h, z11.h, z1.h[6]\n"
- "fmla z27.h, z11.h, z2.h[6]\n"
- "fmla z31.h, z11.h, z3.h[6]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[7]\n"
- "fmla z20.h, z12.h, z1.h[7]\n"
- "fmla z24.h, z12.h, z2.h[7]\n"
- "fmla z28.h, z12.h, z3.h[7]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[7]\n"
- "fmla z21.h, z13.h, z1.h[7]\n"
- "fmla z25.h, z13.h, z2.h[7]\n"
- "fmla z29.h, z13.h, z3.h[7]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[7]\n"
- "fmla z22.h, z14.h, z1.h[7]\n"
- "fmla z26.h, z14.h, z2.h[7]\n"
- "fmla z30.h, z14.h, z3.h[7]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[7]\n"
- "ld1rqh z0.h, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z23.h, z15.h, z1.h[7]\n"
- "ld1rqh z1.h, p6/z, [a_ptr1, #0x10]\n"
- "fmla z27.h, z15.h, z2.h[7]\n"
- "ld1rqh z2.h, p6/z, [a_ptr2, #0x10]\n"
- "fmla z31.h, z15.h, z3.h[7]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[0]\n"
- "ld1rqh z3.h, p6/z, [a_ptr3, #0x10]\n"
- "fmla z20.h, z8.h, z5.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "fmla z24.h, z8.h, z6.h[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "fmla z28.h, z8.h, z7.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z4.h[0]\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- "fmla z21.h, z9.h, z5.h[0]\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- "fmla z25.h, z9.h, z6.h[0]\n"
- "fmla z29.h, z9.h, z7.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[0]\n"
- "fmla z22.h, z10.h, z5.h[0]\n"
- "fmla z26.h, z10.h, z6.h[0]\n"
- "fmla z30.h, z10.h, z7.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[0]\n"
- "fmla z23.h, z11.h, z5.h[0]\n"
- "fmla z27.h, z11.h, z6.h[0]\n"
- "fmla z31.h, z11.h, z7.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[1]\n"
- "fmla z20.h, z12.h, z5.h[1]\n"
- "fmla z24.h, z12.h, z6.h[1]\n"
- "fmla z28.h, z12.h, z7.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[1]\n"
- "fmla z21.h, z13.h, z5.h[1]\n"
- "fmla z25.h, z13.h, z6.h[1]\n"
- "fmla z29.h, z13.h, z7.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[1]\n"
- "fmla z22.h, z14.h, z5.h[1]\n"
- "fmla z26.h, z14.h, z6.h[1]\n"
- "fmla z30.h, z14.h, z7.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[1]\n"
- "fmla z23.h, z15.h, z5.h[1]\n"
- "fmla z27.h, z15.h, z6.h[1]\n"
- "fmla z31.h, z15.h, z7.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z5.h[2]\n"
- "fmla z24.h, z8.h, z6.h[2]\n"
- "fmla z28.h, z8.h, z7.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z17.h, z9.h, z4.h[2]\n"
- "fmla z21.h, z9.h, z5.h[2]\n"
- "fmla z25.h, z9.h, z6.h[2]\n"
- "fmla z29.h, z9.h, z7.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[2]\n"
- "fmla z22.h, z10.h, z5.h[2]\n"
- "fmla z26.h, z10.h, z6.h[2]\n"
- "fmla z30.h, z10.h, z7.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[2]\n"
- "fmla z23.h, z11.h, z5.h[2]\n"
- "fmla z27.h, z11.h, z6.h[2]\n"
- "fmla z31.h, z11.h, z7.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[3]\n"
- "fmla z20.h, z12.h, z5.h[3]\n"
- "fmla z24.h, z12.h, z6.h[3]\n"
- "fmla z28.h, z12.h, z7.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[3]\n"
- "fmla z21.h, z13.h, z5.h[3]\n"
- "fmla z25.h, z13.h, z6.h[3]\n"
- "fmla z29.h, z13.h, z7.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[3]\n"
- "fmla z22.h, z14.h, z5.h[3]\n"
- "fmla z26.h, z14.h, z6.h[3]\n"
- "fmla z30.h, z14.h, z7.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[3]\n"
- "fmla z23.h, z15.h, z5.h[3]\n"
- "fmla z27.h, z15.h, z6.h[3]\n"
- "fmla z31.h, z15.h, z7.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[4]\n"
- "fmla z20.h, z8.h, z5.h[4]\n"
- "fmla z24.h, z8.h, z6.h[4]\n"
- "fmla z28.h, z8.h, z7.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z4.h[4]\n"
- "fmla z21.h, z9.h, z5.h[4]\n"
- "fmla z25.h, z9.h, z6.h[4]\n"
- "fmla z29.h, z9.h, z7.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z4.h[4]\n"
- "fmla z22.h, z10.h, z5.h[4]\n"
- "fmla z26.h, z10.h, z6.h[4]\n"
- "fmla z30.h, z10.h, z7.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z4.h[4]\n"
- "fmla z23.h, z11.h, z5.h[4]\n"
- "fmla z27.h, z11.h, z6.h[4]\n"
- "fmla z31.h, z11.h, z7.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[5]\n"
- "fmla z20.h, z12.h, z5.h[5]\n"
- "fmla z24.h, z12.h, z6.h[5]\n"
- "fmla z28.h, z12.h, z7.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z4.h[5]\n"
- "fmla z21.h, z13.h, z5.h[5]\n"
- "fmla z25.h, z13.h, z6.h[5]\n"
- "fmla z29.h, z13.h, z7.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z4.h[5]\n"
- "fmla z22.h, z14.h, z5.h[5]\n"
- "fmla z26.h, z14.h, z6.h[5]\n"
- "fmla z30.h, z14.h, z7.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z4.h[5]\n"
- "fmla z23.h, z15.h, z5.h[5]\n"
- "fmla z27.h, z15.h, z6.h[5]\n"
- "fmla z31.h, z15.h, z7.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z20.h, z8.h, z5.h[6]\n"
- "fmla z24.h, z8.h, z6.h[6]\n"
- "fmla z28.h, z8.h, z7.h[6]\n"
- "fmla z17.h, z9.h, z4.h[6]\n"
- "fmla z21.h, z9.h, z5.h[6]\n"
- "fmla z25.h, z9.h, z6.h[6]\n"
- "fmla z29.h, z9.h, z7.h[6]\n"
- "fmla z18.h, z10.h, z4.h[6]\n"
- "fmla z22.h, z10.h, z5.h[6]\n"
- "fmla z26.h, z10.h, z6.h[6]\n"
- "fmla z30.h, z10.h, z7.h[6]\n"
- "fmla z19.h, z11.h, z4.h[6]\n"
- "fmla z23.h, z11.h, z5.h[6]\n"
- "fmla z27.h, z11.h, z6.h[6]\n"
- "fmla z31.h, z11.h, z7.h[6]\n"
- "fmla z16.h, z12.h, z4.h[7]\n"
- "fmla z20.h, z12.h, z5.h[7]\n"
- "fmla z24.h, z12.h, z6.h[7]\n"
- "fmla z28.h, z12.h, z7.h[7]\n"
- "fmla z17.h, z13.h, z4.h[7]\n"
- "fmla z21.h, z13.h, z5.h[7]\n"
- "fmla z25.h, z13.h, z6.h[7]\n"
- "fmla z29.h, z13.h, z7.h[7]\n"
- "fmla z18.h, z14.h, z4.h[7]\n"
- "fmla z22.h, z14.h, z5.h[7]\n"
- "fmla z26.h, z14.h, z6.h[7]\n"
- "fmla z30.h, z14.h, z7.h[7]\n"
- "fmla z19.h, z15.h, z4.h[7]\n"
- "fmla z23.h, z15.h, z5.h[7]\n"
- "fmla z27.h, z15.h, z6.h[7]\n"
- "fmla z31.h, z15.h, z7.h[7]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[0]\n"
- "fmla z24.h, z8.h, z2.h[0]\n"
- "fmla z28.h, z8.h, z3.h[0]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "fmla z21.h, z9.h, z1.h[0]\n"
- "fmla z25.h, z9.h, z2.h[0]\n"
- "fmla z29.h, z9.h, z3.h[0]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "fmla z22.h, z10.h, z1.h[0]\n"
- "fmla z26.h, z10.h, z2.h[0]\n"
- "fmla z30.h, z10.h, z3.h[0]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "fmla z23.h, z11.h, z1.h[0]\n"
- "fmla z27.h, z11.h, z2.h[0]\n"
- "fmla z31.h, z11.h, z3.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.h, z12.h, z1.h[1]\n"
- "fmla z24.h, z12.h, z2.h[1]\n"
- "fmla z28.h, z12.h, z3.h[1]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "fmla z21.h, z13.h, z1.h[1]\n"
- "fmla z25.h, z13.h, z2.h[1]\n"
- "fmla z29.h, z13.h, z3.h[1]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z22.h, z14.h, z1.h[1]\n"
- "fmla z26.h, z14.h, z2.h[1]\n"
- "fmla z30.h, z14.h, z3.h[1]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "fmla z23.h, z15.h, z1.h[1]\n"
- "fmla z27.h, z15.h, z2.h[1]\n"
- "fmla z31.h, z15.h, z3.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "fmla z20.h, z8.h, z1.h[2]\n"
- "fmla z24.h, z8.h, z2.h[2]\n"
- "fmla z28.h, z8.h, z3.h[2]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "fmla z21.h, z9.h, z1.h[2]\n"
- "fmla z25.h, z9.h, z2.h[2]\n"
- "fmla z29.h, z9.h, z3.h[2]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "fmla z22.h, z10.h, z1.h[2]\n"
- "fmla z26.h, z10.h, z2.h[2]\n"
- "fmla z30.h, z10.h, z3.h[2]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z23.h, z11.h, z1.h[2]\n"
- "fmla z27.h, z11.h, z2.h[2]\n"
- "fmla z31.h, z11.h, z3.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z12.h, z1.h[3]\n"
- "fmla z24.h, z12.h, z2.h[3]\n"
- "fmla z28.h, z12.h, z3.h[3]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z21.h, z13.h, z1.h[3]\n"
- "fmla z25.h, z13.h, z2.h[3]\n"
- "fmla z29.h, z13.h, z3.h[3]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z22.h, z14.h, z1.h[3]\n"
- "fmla z26.h, z14.h, z2.h[3]\n"
- "fmla z30.h, z14.h, z3.h[3]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "fmla z23.h, z15.h, z1.h[3]\n"
- "fmla z27.h, z15.h, z2.h[3]\n"
- "fmla z31.h, z15.h, z3.h[3]\n"
- "b.eq 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[4]\n"
- "fmla z24.h, z8.h, z2.h[4]\n"
- "fmla z28.h, z8.h, z3.h[4]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z21.h, z9.h, z1.h[4]\n"
- "fmla z25.h, z9.h, z2.h[4]\n"
- "fmla z29.h, z9.h, z3.h[4]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z22.h, z10.h, z1.h[4]\n"
- "fmla z26.h, z10.h, z2.h[4]\n"
- "fmla z30.h, z10.h, z3.h[4]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "fmla z23.h, z11.h, z1.h[4]\n"
- "fmla z27.h, z11.h, z2.h[4]\n"
- "fmla z31.h, z11.h, z3.h[4]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.h, z12.h, z1.h[5]\n"
- "fmla z24.h, z12.h, z2.h[5]\n"
- "fmla z28.h, z12.h, z3.h[5]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z21.h, z13.h, z1.h[5]\n"
- "fmla z25.h, z13.h, z2.h[5]\n"
- "fmla z29.h, z13.h, z3.h[5]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z22.h, z14.h, z1.h[5]\n"
- "fmla z26.h, z14.h, z2.h[5]\n"
- "fmla z30.h, z14.h, z3.h[5]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "fmla z23.h, z15.h, z1.h[5]\n"
- "fmla z27.h, z15.h, z2.h[5]\n"
- "fmla z31.h, z15.h, z3.h[5]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "fmla z20.h, z8.h, z1.h[6]\n"
- "fmla z24.h, z8.h, z2.h[6]\n"
- "fmla z28.h, z8.h, z3.h[6]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "fmla z21.h, z9.h, z1.h[6]\n"
- "fmla z25.h, z9.h, z2.h[6]\n"
- "fmla z29.h, z9.h, z3.h[6]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "fmla z22.h, z10.h, z1.h[6]\n"
- "fmla z26.h, z10.h, z2.h[6]\n"
- "fmla z30.h, z10.h, z3.h[6]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "fmla z23.h, z11.h, z1.h[6]\n"
- "fmla z27.h, z11.h, z2.h[6]\n"
- "fmla z31.h, z11.h, z3.h[6]\n"
- "b 5f\n"
- "4:\n"
- "fmla z16.h, z8.h, z0.h[0]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z8.h, z1.h[0]\n"
- "ld1rqh z4.h, p6/z, [%[a_ptr0]]\n"
- "fmla z24.h, z8.h, z2.h[0]\n"
- "ld1rqh z5.h, p6/z, [a_ptr1]\n"
- "fmla z28.h, z8.h, z3.h[0]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[0]\n"
- "ld1rqh z6.h, p6/z, [a_ptr2]\n"
- "fmla z21.h, z9.h, z1.h[0]\n"
- "ld1rqh z7.h, p6/z, [a_ptr3]\n"
- "fmla z25.h, z9.h, z2.h[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "fmla z29.h, z9.h, z3.h[0]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "fmla z22.h, z10.h, z1.h[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- "fmla z26.h, z10.h, z2.h[0]\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- "fmla z30.h, z10.h, z3.h[0]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[0]\n"
- "fmla z23.h, z11.h, z1.h[0]\n"
- "fmla z27.h, z11.h, z2.h[0]\n"
- "fmla z31.h, z11.h, z3.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[1]\n"
- "fmla z20.h, z12.h, z1.h[1]\n"
- "fmla z24.h, z12.h, z2.h[1]\n"
- "fmla z28.h, z12.h, z3.h[1]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[1]\n"
- "fmla z21.h, z13.h, z1.h[1]\n"
- "fmla z25.h, z13.h, z2.h[1]\n"
- "fmla z29.h, z13.h, z3.h[1]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[1]\n"
- "fmla z22.h, z14.h, z1.h[1]\n"
- "fmla z26.h, z14.h, z2.h[1]\n"
- "fmla z30.h, z14.h, z3.h[1]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[1]\n"
- "fmla z23.h, z15.h, z1.h[1]\n"
- "fmla z27.h, z15.h, z2.h[1]\n"
- "fmla z31.h, z15.h, z3.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.h, z8.h, z1.h[2]\n"
- "fmla z24.h, z8.h, z2.h[2]\n"
- "fmla z28.h, z8.h, z3.h[2]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z17.h, z9.h, z0.h[2]\n"
- "fmla z21.h, z9.h, z1.h[2]\n"
- "fmla z25.h, z9.h, z2.h[2]\n"
- "fmla z29.h, z9.h, z3.h[2]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[2]\n"
- "fmla z22.h, z10.h, z1.h[2]\n"
- "fmla z26.h, z10.h, z2.h[2]\n"
- "fmla z30.h, z10.h, z3.h[2]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[2]\n"
- "fmla z23.h, z11.h, z1.h[2]\n"
- "fmla z27.h, z11.h, z2.h[2]\n"
- "fmla z31.h, z11.h, z3.h[2]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[3]\n"
- "fmla z20.h, z12.h, z1.h[3]\n"
- "fmla z24.h, z12.h, z2.h[3]\n"
- "fmla z28.h, z12.h, z3.h[3]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[3]\n"
- "fmla z21.h, z13.h, z1.h[3]\n"
- "fmla z25.h, z13.h, z2.h[3]\n"
- "fmla z29.h, z13.h, z3.h[3]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[3]\n"
- "fmla z22.h, z14.h, z1.h[3]\n"
- "fmla z26.h, z14.h, z2.h[3]\n"
- "fmla z30.h, z14.h, z3.h[3]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[3]\n"
- "fmla z23.h, z15.h, z1.h[3]\n"
- "fmla z27.h, z15.h, z2.h[3]\n"
- "fmla z31.h, z15.h, z3.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[4]\n"
- "fmla z20.h, z8.h, z1.h[4]\n"
- "fmla z24.h, z8.h, z2.h[4]\n"
- "fmla z28.h, z8.h, z3.h[4]\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "fmla z17.h, z9.h, z0.h[4]\n"
- "fmla z21.h, z9.h, z1.h[4]\n"
- "fmla z25.h, z9.h, z2.h[4]\n"
- "fmla z29.h, z9.h, z3.h[4]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.h, z10.h, z0.h[4]\n"
- "fmla z22.h, z10.h, z1.h[4]\n"
- "fmla z26.h, z10.h, z2.h[4]\n"
- "fmla z30.h, z10.h, z3.h[4]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.h, z11.h, z0.h[4]\n"
- "fmla z23.h, z11.h, z1.h[4]\n"
- "fmla z27.h, z11.h, z2.h[4]\n"
- "fmla z31.h, z11.h, z3.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.h, z12.h, z0.h[5]\n"
- "fmla z20.h, z12.h, z1.h[5]\n"
- "fmla z24.h, z12.h, z2.h[5]\n"
- "fmla z28.h, z12.h, z3.h[5]\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.h, z13.h, z0.h[5]\n"
- "fmla z21.h, z13.h, z1.h[5]\n"
- "fmla z25.h, z13.h, z2.h[5]\n"
- "fmla z29.h, z13.h, z3.h[5]\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.h, z14.h, z0.h[5]\n"
- "fmla z22.h, z14.h, z1.h[5]\n"
- "fmla z26.h, z14.h, z2.h[5]\n"
- "fmla z30.h, z14.h, z3.h[5]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.h, z15.h, z0.h[5]\n"
- "fmla z23.h, z15.h, z1.h[5]\n"
- "fmla z27.h, z15.h, z2.h[5]\n"
- "fmla z31.h, z15.h, z3.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.h, z8.h, z0.h[6]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z20.h, z8.h, z1.h[6]\n"
- "fmla z24.h, z8.h, z2.h[6]\n"
- "fmla z28.h, z8.h, z3.h[6]\n"
- "fmla z17.h, z9.h, z0.h[6]\n"
- "fmla z21.h, z9.h, z1.h[6]\n"
- "fmla z25.h, z9.h, z2.h[6]\n"
- "fmla z29.h, z9.h, z3.h[6]\n"
- "fmla z18.h, z10.h, z0.h[6]\n"
- "fmla z22.h, z10.h, z1.h[6]\n"
- "fmla z26.h, z10.h, z2.h[6]\n"
- "fmla z30.h, z10.h, z3.h[6]\n"
- "fmla z19.h, z11.h, z0.h[6]\n"
- "fmla z23.h, z11.h, z1.h[6]\n"
- "fmla z27.h, z11.h, z2.h[6]\n"
- "fmla z31.h, z11.h, z3.h[6]\n"
- "fmla z16.h, z12.h, z0.h[7]\n"
- "fmla z20.h, z12.h, z1.h[7]\n"
- "fmla z24.h, z12.h, z2.h[7]\n"
- "fmla z28.h, z12.h, z3.h[7]\n"
- "fmla z17.h, z13.h, z0.h[7]\n"
- "fmla z21.h, z13.h, z1.h[7]\n"
- "fmla z25.h, z13.h, z2.h[7]\n"
- "fmla z29.h, z13.h, z3.h[7]\n"
- "fmla z18.h, z14.h, z0.h[7]\n"
- "fmla z22.h, z14.h, z1.h[7]\n"
- "fmla z26.h, z14.h, z2.h[7]\n"
- "fmla z30.h, z14.h, z3.h[7]\n"
- "fmla z19.h, z15.h, z0.h[7]\n"
- "fmla z23.h, z15.h, z1.h[7]\n"
- "fmla z27.h, z15.h, z2.h[7]\n"
- "fmla z31.h, z15.h, z3.h[7]\n"
- "cbz %[blocks], 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[0]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.h, z8.h, z5.h[0]\n"
- "fmla z24.h, z8.h, z6.h[0]\n"
- "fmla z28.h, z8.h, z7.h[0]\n"
- "fmla z17.h, z9.h, z4.h[0]\n"
- "fmla z21.h, z9.h, z5.h[0]\n"
- "fmla z25.h, z9.h, z6.h[0]\n"
- "fmla z29.h, z9.h, z7.h[0]\n"
- "fmla z18.h, z10.h, z4.h[0]\n"
- "fmla z22.h, z10.h, z5.h[0]\n"
- "fmla z26.h, z10.h, z6.h[0]\n"
- "fmla z30.h, z10.h, z7.h[0]\n"
- "fmla z19.h, z11.h, z4.h[0]\n"
- "fmla z23.h, z11.h, z5.h[0]\n"
- "fmla z27.h, z11.h, z6.h[0]\n"
- "fmla z31.h, z11.h, z7.h[0]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[1]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.h, z12.h, z5.h[1]\n"
- "fmla z24.h, z12.h, z6.h[1]\n"
- "fmla z28.h, z12.h, z7.h[1]\n"
- "fmla z17.h, z13.h, z4.h[1]\n"
- "fmla z21.h, z13.h, z5.h[1]\n"
- "fmla z25.h, z13.h, z6.h[1]\n"
- "fmla z29.h, z13.h, z7.h[1]\n"
- "fmla z18.h, z14.h, z4.h[1]\n"
- "fmla z22.h, z14.h, z5.h[1]\n"
- "fmla z26.h, z14.h, z6.h[1]\n"
- "fmla z30.h, z14.h, z7.h[1]\n"
- "fmla z19.h, z15.h, z4.h[1]\n"
- "fmla z23.h, z15.h, z5.h[1]\n"
- "fmla z27.h, z15.h, z6.h[1]\n"
- "fmla z31.h, z15.h, z7.h[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[2]\n"
- "fmla z20.h, z8.h, z5.h[2]\n"
- "fmla z24.h, z8.h, z6.h[2]\n"
- "fmla z28.h, z8.h, z7.h[2]\n"
- "fmla z17.h, z9.h, z4.h[2]\n"
- "fmla z21.h, z9.h, z5.h[2]\n"
- "fmla z25.h, z9.h, z6.h[2]\n"
- "fmla z29.h, z9.h, z7.h[2]\n"
- "fmla z18.h, z10.h, z4.h[2]\n"
- "fmla z22.h, z10.h, z5.h[2]\n"
- "fmla z26.h, z10.h, z6.h[2]\n"
- "fmla z30.h, z10.h, z7.h[2]\n"
- "fmla z19.h, z11.h, z4.h[2]\n"
- "fmla z23.h, z11.h, z5.h[2]\n"
- "fmla z27.h, z11.h, z6.h[2]\n"
- "fmla z31.h, z11.h, z7.h[2]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[3]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.h, z12.h, z5.h[3]\n"
- "fmla z24.h, z12.h, z6.h[3]\n"
- "fmla z28.h, z12.h, z7.h[3]\n"
- "fmla z17.h, z13.h, z4.h[3]\n"
- "fmla z21.h, z13.h, z5.h[3]\n"
- "fmla z25.h, z13.h, z6.h[3]\n"
- "fmla z29.h, z13.h, z7.h[3]\n"
- "fmla z18.h, z14.h, z4.h[3]\n"
- "fmla z22.h, z14.h, z5.h[3]\n"
- "fmla z26.h, z14.h, z6.h[3]\n"
- "fmla z30.h, z14.h, z7.h[3]\n"
- "fmla z19.h, z15.h, z4.h[3]\n"
- "fmla z23.h, z15.h, z5.h[3]\n"
- "fmla z27.h, z15.h, z6.h[3]\n"
- "fmla z31.h, z15.h, z7.h[3]\n"
- "b.eq 5f\n"
- "ld1h z8.h, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[4]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.h, z8.h, z5.h[4]\n"
- "fmla z24.h, z8.h, z6.h[4]\n"
- "fmla z28.h, z8.h, z7.h[4]\n"
- "fmla z17.h, z9.h, z4.h[4]\n"
- "fmla z21.h, z9.h, z5.h[4]\n"
- "fmla z25.h, z9.h, z6.h[4]\n"
- "fmla z29.h, z9.h, z7.h[4]\n"
- "fmla z18.h, z10.h, z4.h[4]\n"
- "fmla z22.h, z10.h, z5.h[4]\n"
- "fmla z26.h, z10.h, z6.h[4]\n"
- "fmla z30.h, z10.h, z7.h[4]\n"
- "fmla z19.h, z11.h, z4.h[4]\n"
- "fmla z23.h, z11.h, z5.h[4]\n"
- "fmla z27.h, z11.h, z6.h[4]\n"
- "fmla z31.h, z11.h, z7.h[4]\n"
- "b.eq 5f\n"
- "ld1h z12.h, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1h z13.h, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1h z14.h, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.h, z12.h, z4.h[5]\n"
- "ld1h z15.h, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.h, z12.h, z5.h[5]\n"
- "fmla z24.h, z12.h, z6.h[5]\n"
- "fmla z28.h, z12.h, z7.h[5]\n"
- "fmla z17.h, z13.h, z4.h[5]\n"
- "fmla z21.h, z13.h, z5.h[5]\n"
- "fmla z25.h, z13.h, z6.h[5]\n"
- "fmla z29.h, z13.h, z7.h[5]\n"
- "fmla z18.h, z14.h, z4.h[5]\n"
- "fmla z22.h, z14.h, z5.h[5]\n"
- "fmla z26.h, z14.h, z6.h[5]\n"
- "fmla z30.h, z14.h, z7.h[5]\n"
- "fmla z19.h, z15.h, z4.h[5]\n"
- "fmla z23.h, z15.h, z5.h[5]\n"
- "fmla z27.h, z15.h, z6.h[5]\n"
- "fmla z31.h, z15.h, z7.h[5]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1h z8.h, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1h z9.h, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1h z10.h, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1h z11.h, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.h, z8.h, z4.h[6]\n"
- "fmla z20.h, z8.h, z5.h[6]\n"
- "fmla z24.h, z8.h, z6.h[6]\n"
- "fmla z28.h, z8.h, z7.h[6]\n"
- "fmla z17.h, z9.h, z4.h[6]\n"
- "fmla z21.h, z9.h, z5.h[6]\n"
- "fmla z25.h, z9.h, z6.h[6]\n"
- "fmla z29.h, z9.h, z7.h[6]\n"
- "fmla z18.h, z10.h, z4.h[6]\n"
- "fmla z22.h, z10.h, z5.h[6]\n"
- "fmla z26.h, z10.h, z6.h[6]\n"
- "fmla z30.h, z10.h, z7.h[6]\n"
- "fmla z19.h, z11.h, z4.h[6]\n"
- "fmla z23.h, z11.h, z5.h[6]\n"
- "fmla z27.h, z11.h, z6.h[6]\n"
- "fmla z31.h, z11.h, z7.h[6]\n"
- "5:\n"
- "ld1rh z14.h, p7/z, [%[minptr]]\n"
- "ld1rh z15.h, p7/z, [%[maxptr]]\n"
- "fmax z16.h, p7/m, z16.h, z14.h\n"
- "fmax z17.h, p7/m, z17.h, z14.h\n"
- "fmax z18.h, p7/m, z18.h, z14.h\n"
- "fmax z19.h, p7/m, z19.h, z14.h\n"
- "fmin z16.h, p7/m, z16.h, z15.h\n"
- "fmin z17.h, p7/m, z17.h, z15.h\n"
- "fmin z18.h, p7/m, z18.h, z15.h\n"
- "fmin z19.h, p7/m, z19.h, z15.h\n"
- "st1h z16.h, p0, [%[c_ptr0]]\n"
- "fmax z20.h, p7/m, z20.h, z14.h\n"
- "fmax z21.h, p7/m, z21.h, z14.h\n"
- "fmax z22.h, p7/m, z22.h, z14.h\n"
- "st1h z17.h, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmax z23.h, p7/m, z23.h, z14.h\n"
- "fmin z20.h, p7/m, z20.h, z15.h\n"
- "fmin z21.h, p7/m, z21.h, z15.h\n"
- "st1h z18.h, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmin z22.h, p7/m, z22.h, z15.h\n"
- "fmin z23.h, p7/m, z23.h, z15.h\n"
- "fmax z24.h, p7/m, z24.h, z14.h\n"
- "st1h z19.h, p3, [%[c_ptr0], #3, MUL VL]\n"
- "fmax z25.h, p7/m, z25.h, z14.h\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "fmax z26.h, p7/m, z26.h, z14.h\n"
- "st1h z20.h, p0, [c_ptr1]\n"
- "fmin z24.h, p7/m, z24.h, z15.h\n"
- "fmin z25.h, p7/m, z25.h, z15.h\n"
- "fmax z27.h, p7/m, z27.h, z14.h\n"
- "st1h z21.h, p1, [c_ptr1, #1, MUL VL]\n"
- "fmin z26.h, p7/m, z26.h, z15.h\n"
- "fmax z28.h, p7/m, z28.h, z14.h\n"
- "fmax z29.h, p7/m, z29.h, z14.h\n"
- "st1h z22.h, p2, [c_ptr1, #2, MUL VL]\n"
- "fmin z27.h, p7/m, z27.h, z15.h\n"
- "fmax z30.h, p7/m, z30.h, z14.h\n"
- "fmin z28.h, p7/m, z28.h, z15.h\n"
- "st1h z23.h, p3, [c_ptr1, #3, MUL VL]\n"
- "fmin z29.h, p7/m, z29.h, z15.h\n"
- "fmax z31.h, p7/m, z31.h, z14.h\n"
- "fmin z30.h, p7/m, z30.h, z15.h\n"
- "st1h z24.h, p0, [c_ptr2]\n"
- "fmin z31.h, p7/m, z31.h, z15.h\n"
- "st1h z25.h, p1, [c_ptr2, #1, MUL VL]\n"
- "st1h z26.h, p2, [c_ptr2, #2, MUL VL]\n"
- "st1h z27.h, p3, [c_ptr2, #3, MUL VL]\n"
- "st1h z28.h, p0, [c_ptr3]\n"
- "st1h z29.h, p1, [c_ptr3, #1, MUL VL]\n"
- "st1h z30.h, p2, [c_ptr3, #2, MUL VL]\n"
- "st1h z31.h, p3, [c_ptr3, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- }
-
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
index ebef413848..0260050f29 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,42 +10,48 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
-
#ifdef __ARM_FEATURE_SVE
-
#include "../std_transforms_sve.hpp"
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<__fp16>, \
+ size_t, size_t, \
+ const __fp16 *, \
+ IndirectOutputArg<__fp16>, \
+ const __fp16 *, Activation, bool
+
namespace arm_gemm
{
// Actual kernel implementations
-void sve_hybrid_fp16_mla_4VLx4(const __fp16 *, int, const __fp16 *, __fp16 *, int, int, int, int, const __fp16 *, Activation, bool);
+void sve_hybrid_fp16_mla_6x4VL( ARGLIST );
-class hybrid_fp16_mla_4VLx4
+class cls_sve_hybrid_fp16_mla_6x4VL
{
public:
typedef __fp16 operand_type;
typedef __fp16 result_type;
- typedef void (*kern_type)(const __fp16 *, int, const __fp16 *, __fp16 *, int, int, int, int, const __fp16 *, Activation, bool);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
static constexpr unsigned int out_height()
{
- return 4;
+ return 6;
}
static unsigned int out_width()
@@ -63,27 +69,17 @@ public:
return true;
}
- static constexpr bool supports_bias()
- {
- return true;
- }
-
- static constexpr bool supports_activation()
- {
- return true;
- }
-
- StdTransformsSVE<operand_type, result_type, 4, 4, 1> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 6, 4, 1> transforms = {};
// Default to the generic kernel
- kern_type kernel=sve_hybrid_fp16_mla_4VLx4;
+ kern_type kernel=sve_hybrid_fp16_mla_6x4VL;
- hybrid_fp16_mla_4VLx4(const CPUInfo *)
+ cls_sve_hybrid_fp16_mla_6x4VL(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
+#undef ARGLIST
#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
new file mode 100644
index 0000000000..b19842b122
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp16_mla_6x4VL/generic.cpp
@@ -0,0 +1,3178 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_fp16_mla_6x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<__fp16> A_arg,
+ size_t M, size_t N, const __fp16 *B_ptr, IndirectOutputArg<__fp16> output_arg,
+ const __fp16 *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs { // Scalar kernel parameters gathered in one struct; populated by the C++ code that follows, ahead of the asm block.
+ __fp16 maxval = static_cast<__fp16>(std::numeric_limits<float>::infinity()); // Upper clamp bound: stays +inf unless BoundedReLU overwrites it with act.param1.
+ __fp16 minval = - static_cast<__fp16>(std::numeric_limits<float>::infinity()); // Lower clamp bound: stays -inf unless ReLU/BoundedReLU set it to 0.
+ unsigned int num_strings = {}; // Copy of the num_strings argument.
+ const unsigned int *string_lengths = {}; // Copy of the string_lengths argument.
+ size_t N = {}; // Copy of the N argument.
+ const __fp16 *B_ptr = {}; // Copy of the B_ptr argument.
+ size_t output_offset = {}; // output_arg.indirect.offset when the output is indirect, otherwise output_arg.direct.stride.
+ size_t input_initial_col = {}; // A_arg.indirect.start_col; only written for indirect input, otherwise left at 0.
+ size_t input_offset = {}; // A_arg.indirect.start_row when the input is indirect, otherwise A_arg.direct.stride.
+ } ka; // NOTE(review): std::numeric_limits lives in <limits>, which this file does not include directly — presumably it arrives via arm_gemm.hpp or utils.hpp; confirm.
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<__fp16>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+ "ptrue p5.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 71f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 57f\n"
+ "beq 43f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 29f\n"
+ "beq 15f\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #1\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p3.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p2.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p1.h, x19, x16\n"
+ "cbz x14, 4f\n"
+ "ld1h { z8.h }, p5/Z, [x14]\n"
+ "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "b 6f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 5f\n"
+ "ld1h { z8.h }, p4/Z, [x13]\n"
+ "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
+ "b 6f\n"
+ "5:" // Height 1: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "6:" // Height 1: setup done
+ "mov x12, #0x0\n"
+ "7:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 8f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "cbnz x12, 9f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "b 9f\n"
+ "8:" // Height 1: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "9:" // Height 1: input setup done
+ "cmp x11, #0x8\n"
+ "ble 11f\n"
+ "10:" // Height 1: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "fmla z8.h, z6.h, z0.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.h, z7.h, z0.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "cmp x11, #0x8\n"
+ "fmla z10.h, z6.h, z0.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla z11.h, z7.h, z0.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.h, z6.h, z0.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z11.h, z7.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.h, z6.h, z0.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[7]\n"
+ "fmla z11.h, z7.h, z0.h[7]\n"
+ "bgt 10b\n"
+ "11:" // Height 1: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "fmla z8.h, z6.h, z0.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.h, z7.h, z0.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[0]\n"
+ "fmla z11.h, z7.h, z0.h[0]\n"
+ "ble 12f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z9.h, z7.h, z0.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[1]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z11.h, z7.h, z0.h[1]\n"
+ "ble 12f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z9.h, z7.h, z0.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[2]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z11.h, z7.h, z0.h[2]\n"
+ "ble 12f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z9.h, z7.h, z0.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[3]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z11.h, z7.h, z0.h[3]\n"
+ "ble 12f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z9.h, z7.h, z0.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[4]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z11.h, z7.h, z0.h[4]\n"
+ "ble 12f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z9.h, z7.h, z0.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[5]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z11.h, z7.h, z0.h[5]\n"
+ "ble 12f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z9.h, z7.h, z0.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[6]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z11.h, z7.h, z0.h[6]\n"
+ "ble 12f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[7]\n"
+ "fmla z11.h, z7.h, z0.h[7]\n"
+ "12:" // Height 1: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 7b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "tbz %x[flags], #1, 13f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z1.h }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z0.h }, p5/Z, [x19]\n"
+ "fmin z8.h, p5/M, z8.h, z0.h\n"
+ "fmin z9.h, p5/M, z9.h, z0.h\n"
+ "fmin z10.h, p5/M, z10.h, z0.h\n"
+ "fmin z11.h, p5/M, z11.h, z0.h\n"
+ "fmax z8.h, p5/M, z8.h, z1.h\n"
+ "fmax z9.h, p5/M, z9.h, z1.h\n"
+ "fmax z10.h, p5/M, z10.h, z1.h\n"
+ "fmax z11.h, p5/M, z11.h, z1.h\n"
+ "13:" // Height 1: No activation
+ "st1h { z8.h }, p4, [x13]\n"
+ "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "14:" // Height 1: Writeback done
+ "mov x19, #0x0\n"
+ "inch x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 3b\n"
+ "b 86f\n"
+ "15:" // Height 2
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 16f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #1\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19, LSL #1\n"
+ "b 17f\n"
+ "16:" // Height 2: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #1\n"
+ "17:" // Height 2: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p3.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p2.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p1.h, x19, x16\n"
+ "cbz x14, 18f\n"
+ "ld1h { z8.h }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n"
+ "mov z13.d, z9.d\n"
+ "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "b 20f\n"
+ "18:" // Height 2: no bias
+ "tbz %x[flags], #0, 19f\n"
+ "ld1h { z8.h }, p4/Z, [x13]\n"
+ "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x9]\n"
+ "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n"
+ "b 20f\n"
+ "19:" // Height 2: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "20:" // Height 2: setup done
+ "mov x12, #0x0\n"
+ "21:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 22f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x12, 23f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "b 23f\n"
+ "22:" // Height 2: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "23:" // Height 2: input setup done
+ "cmp x11, #0x8\n"
+ "ble 25f\n"
+ "24:" // Height 2: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "fmla z8.h, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.h, z7.h, z0.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.h, z6.h, z1.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "cmp x11, #0x8\n"
+ "fmla z13.h, z7.h, z1.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla z10.h, z6.h, z0.h[0]\n"
+ "fmla z14.h, z6.h, z1.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[0]\n"
+ "fmla z15.h, z7.h, z1.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[1]\n"
+ "fmla z12.h, z6.h, z1.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[1]\n"
+ "fmla z13.h, z7.h, z1.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.h, z6.h, z0.h[1]\n"
+ "fmla z14.h, z6.h, z1.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[1]\n"
+ "fmla z15.h, z7.h, z1.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[2]\n"
+ "fmla z12.h, z6.h, z1.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[2]\n"
+ "fmla z13.h, z7.h, z1.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[2]\n"
+ "fmla z14.h, z6.h, z1.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[2]\n"
+ "fmla z15.h, z7.h, z1.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[3]\n"
+ "fmla z12.h, z6.h, z1.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[3]\n"
+ "fmla z13.h, z7.h, z1.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[3]\n"
+ "fmla z14.h, z6.h, z1.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z11.h, z7.h, z0.h[3]\n"
+ "fmla z15.h, z7.h, z1.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[4]\n"
+ "fmla z12.h, z6.h, z1.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[4]\n"
+ "fmla z13.h, z7.h, z1.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[4]\n"
+ "fmla z14.h, z6.h, z1.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[4]\n"
+ "fmla z15.h, z7.h, z1.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[5]\n"
+ "fmla z12.h, z6.h, z1.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[5]\n"
+ "fmla z13.h, z7.h, z1.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.h, z6.h, z0.h[5]\n"
+ "fmla z14.h, z6.h, z1.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[5]\n"
+ "fmla z15.h, z7.h, z1.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[6]\n"
+ "fmla z12.h, z6.h, z1.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[6]\n"
+ "fmla z13.h, z7.h, z1.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[6]\n"
+ "fmla z14.h, z6.h, z1.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[6]\n"
+ "fmla z15.h, z7.h, z1.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[7]\n"
+ "fmla z12.h, z6.h, z1.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[7]\n"
+ "fmla z13.h, z7.h, z1.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[7]\n"
+ "fmla z14.h, z6.h, z1.h[7]\n"
+ "fmla z11.h, z7.h, z0.h[7]\n"
+ "fmla z15.h, z7.h, z1.h[7]\n"
+ "bgt 24b\n"
+ "25:" // Height 2: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "fmla z8.h, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.h, z7.h, z0.h[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.h, z6.h, z1.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z13.h, z7.h, z1.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[0]\n"
+ "fmla z14.h, z6.h, z1.h[0]\n"
+ "fmla z11.h, z7.h, z0.h[0]\n"
+ "fmla z15.h, z7.h, z1.h[0]\n"
+ "ble 26f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[1]\n"
+ "fmla z13.h, z7.h, z1.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[1]\n"
+ "fmla z14.h, z6.h, z1.h[1]\n"
+ "fmla z11.h, z7.h, z0.h[1]\n"
+ "fmla z15.h, z7.h, z1.h[1]\n"
+ "ble 26f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[2]\n"
+ "fmla z13.h, z7.h, z1.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[2]\n"
+ "fmla z14.h, z6.h, z1.h[2]\n"
+ "fmla z11.h, z7.h, z0.h[2]\n"
+ "fmla z15.h, z7.h, z1.h[2]\n"
+ "ble 26f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[3]\n"
+ "fmla z13.h, z7.h, z1.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[3]\n"
+ "fmla z14.h, z6.h, z1.h[3]\n"
+ "fmla z11.h, z7.h, z0.h[3]\n"
+ "fmla z15.h, z7.h, z1.h[3]\n"
+ "ble 26f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[4]\n"
+ "fmla z13.h, z7.h, z1.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[4]\n"
+ "fmla z14.h, z6.h, z1.h[4]\n"
+ "fmla z11.h, z7.h, z0.h[4]\n"
+ "fmla z15.h, z7.h, z1.h[4]\n"
+ "ble 26f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[5]\n"
+ "fmla z13.h, z7.h, z1.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[5]\n"
+ "fmla z14.h, z6.h, z1.h[5]\n"
+ "fmla z11.h, z7.h, z0.h[5]\n"
+ "fmla z15.h, z7.h, z1.h[5]\n"
+ "ble 26f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[6]\n"
+ "fmla z13.h, z7.h, z1.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[6]\n"
+ "fmla z14.h, z6.h, z1.h[6]\n"
+ "fmla z11.h, z7.h, z0.h[6]\n"
+ "fmla z15.h, z7.h, z1.h[6]\n"
+ "ble 26f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z12.h, z6.h, z1.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[7]\n"
+ "fmla z13.h, z7.h, z1.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[7]\n"
+ "fmla z14.h, z6.h, z1.h[7]\n"
+ "fmla z11.h, z7.h, z0.h[7]\n"
+ "fmla z15.h, z7.h, z1.h[7]\n"
+ "26:" // Height 2: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 21b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "tbz %x[flags], #1, 27f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z1.h }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z0.h }, p5/Z, [x19]\n"
+ "fmin z8.h, p5/M, z8.h, z0.h\n"
+ "fmin z9.h, p5/M, z9.h, z0.h\n"
+ "fmin z10.h, p5/M, z10.h, z0.h\n"
+ "fmin z11.h, p5/M, z11.h, z0.h\n"
+ "fmin z12.h, p5/M, z12.h, z0.h\n"
+ "fmax z8.h, p5/M, z8.h, z1.h\n"
+ "fmax z9.h, p5/M, z9.h, z1.h\n"
+ "fmax z10.h, p5/M, z10.h, z1.h\n"
+ "fmax z11.h, p5/M, z11.h, z1.h\n"
+ "fmax z12.h, p5/M, z12.h, z1.h\n"
+ "fmin z13.h, p5/M, z13.h, z0.h\n"
+ "fmin z14.h, p5/M, z14.h, z0.h\n"
+ "fmin z15.h, p5/M, z15.h, z0.h\n"
+ "fmax z13.h, p5/M, z13.h, z1.h\n"
+ "fmax z14.h, p5/M, z14.h, z1.h\n"
+ "fmax z15.h, p5/M, z15.h, z1.h\n"
+ "27:" // Height 2: No activation
+ "st1h { z8.h }, p4, [x13]\n"
+ "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1h { z12.h }, p4, [x9]\n"
+ "st1h { z13.h }, p3, [x9, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x9, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "28:" // Height 2: Writeback done
+ "mov x19, #0x0\n"
+ "inch x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 17b\n"
+ "b 86f\n"
+ "29:" // Height 3
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 30f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #1\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #1\n"
+ "add x27, x27, x19, LSL #1\n"
+ "b 31f\n"
+ "30:" // Height 3: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #1\n"
+ "add x27, x9, x19, LSL #1\n"
+ "31:" // Height 3: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p3.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p2.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p1.h, x19, x16\n"
+ "cbz x14, 32f\n"
+ "ld1h { z8.h }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n"
+ "mov z13.d, z9.d\n"
+ "addvl x14, x14, #4\n"
+ "mov z17.d, z9.d\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "b 34f\n"
+ "32:" // Height 3: no bias
+ "tbz %x[flags], #0, 33f\n"
+ "ld1h { z8.h }, p4/Z, [x13]\n"
+ "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x9]\n"
+ "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x27]\n"
+ "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n"
+ "b 34f\n"
+ "33:" // Height 3: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "34:" // Height 3: setup done
+ "mov x12, #0x0\n"
+ "35:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 36f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x12, 37f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "b 37f\n"
+ "36:" // Height 3: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "37:" // Height 3: input setup done
+ "cmp x11, #0x8\n"
+ "ble 39f\n"
+ "38:" // Height 3: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "fmla z8.h, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.h, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.h, z6.h, z1.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.h, z6.h, z2.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "cmp x11, #0x8\n"
+ "fmla z13.h, z7.h, z1.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla z17.h, z7.h, z2.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla z10.h, z6.h, z0.h[0]\n"
+ "fmla z14.h, z6.h, z1.h[0]\n"
+ "fmla z18.h, z6.h, z2.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[0]\n"
+ "fmla z15.h, z7.h, z1.h[0]\n"
+ "fmla z19.h, z7.h, z2.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[1]\n"
+ "fmla z12.h, z6.h, z1.h[1]\n"
+ "fmla z16.h, z6.h, z2.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[1]\n"
+ "fmla z13.h, z7.h, z1.h[1]\n"
+ "fmla z17.h, z7.h, z2.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.h, z6.h, z0.h[1]\n"
+ "fmla z14.h, z6.h, z1.h[1]\n"
+ "fmla z18.h, z6.h, z2.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[1]\n"
+ "fmla z15.h, z7.h, z1.h[1]\n"
+ "fmla z19.h, z7.h, z2.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[2]\n"
+ "fmla z12.h, z6.h, z1.h[2]\n"
+ "fmla z16.h, z6.h, z2.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[2]\n"
+ "fmla z13.h, z7.h, z1.h[2]\n"
+ "fmla z17.h, z7.h, z2.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[2]\n"
+ "fmla z14.h, z6.h, z1.h[2]\n"
+ "fmla z18.h, z6.h, z2.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[2]\n"
+ "fmla z15.h, z7.h, z1.h[2]\n"
+ "fmla z19.h, z7.h, z2.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[3]\n"
+ "fmla z12.h, z6.h, z1.h[3]\n"
+ "fmla z16.h, z6.h, z2.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[3]\n"
+ "fmla z13.h, z7.h, z1.h[3]\n"
+ "fmla z17.h, z7.h, z2.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[3]\n"
+ "fmla z14.h, z6.h, z1.h[3]\n"
+ "fmla z18.h, z6.h, z2.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z11.h, z7.h, z0.h[3]\n"
+ "fmla z15.h, z7.h, z1.h[3]\n"
+ "fmla z19.h, z7.h, z2.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[4]\n"
+ "fmla z12.h, z6.h, z1.h[4]\n"
+ "fmla z16.h, z6.h, z2.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[4]\n"
+ "fmla z13.h, z7.h, z1.h[4]\n"
+ "fmla z17.h, z7.h, z2.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[4]\n"
+ "fmla z14.h, z6.h, z1.h[4]\n"
+ "fmla z18.h, z6.h, z2.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[4]\n"
+ "fmla z15.h, z7.h, z1.h[4]\n"
+ "fmla z19.h, z7.h, z2.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[5]\n"
+ "fmla z12.h, z6.h, z1.h[5]\n"
+ "fmla z16.h, z6.h, z2.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[5]\n"
+ "fmla z13.h, z7.h, z1.h[5]\n"
+ "fmla z17.h, z7.h, z2.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.h, z6.h, z0.h[5]\n"
+ "fmla z14.h, z6.h, z1.h[5]\n"
+ "fmla z18.h, z6.h, z2.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[5]\n"
+ "fmla z15.h, z7.h, z1.h[5]\n"
+ "fmla z19.h, z7.h, z2.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[6]\n"
+ "fmla z12.h, z6.h, z1.h[6]\n"
+ "fmla z16.h, z6.h, z2.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[6]\n"
+ "fmla z13.h, z7.h, z1.h[6]\n"
+ "fmla z17.h, z7.h, z2.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[6]\n"
+ "fmla z14.h, z6.h, z1.h[6]\n"
+ "fmla z18.h, z6.h, z2.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[6]\n"
+ "fmla z15.h, z7.h, z1.h[6]\n"
+ "fmla z19.h, z7.h, z2.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[7]\n"
+ "fmla z12.h, z6.h, z1.h[7]\n"
+ "fmla z16.h, z6.h, z2.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[7]\n"
+ "fmla z13.h, z7.h, z1.h[7]\n"
+ "fmla z17.h, z7.h, z2.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[7]\n"
+ "fmla z14.h, z6.h, z1.h[7]\n"
+ "fmla z18.h, z6.h, z2.h[7]\n"
+ "fmla z11.h, z7.h, z0.h[7]\n"
+ "fmla z15.h, z7.h, z1.h[7]\n"
+ "fmla z19.h, z7.h, z2.h[7]\n"
+ "bgt 38b\n"
+ "39:" // Height 3: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "fmla z8.h, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.h, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.h, z6.h, z1.h[0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z13.h, z7.h, z1.h[0]\n"
+ "fmla z16.h, z6.h, z2.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z17.h, z7.h, z2.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[0]\n"
+ "fmla z14.h, z6.h, z1.h[0]\n"
+ "fmla z18.h, z6.h, z2.h[0]\n"
+ "fmla z11.h, z7.h, z0.h[0]\n"
+ "fmla z15.h, z7.h, z1.h[0]\n"
+ "fmla z19.h, z7.h, z2.h[0]\n"
+ "ble 40f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[1]\n"
+ "fmla z16.h, z6.h, z2.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[1]\n"
+ "fmla z13.h, z7.h, z1.h[1]\n"
+ "fmla z17.h, z7.h, z2.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[1]\n"
+ "fmla z14.h, z6.h, z1.h[1]\n"
+ "fmla z18.h, z6.h, z2.h[1]\n"
+ "fmla z11.h, z7.h, z0.h[1]\n"
+ "fmla z15.h, z7.h, z1.h[1]\n"
+ "fmla z19.h, z7.h, z2.h[1]\n"
+ "ble 40f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[2]\n"
+ "fmla z16.h, z6.h, z2.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[2]\n"
+ "fmla z13.h, z7.h, z1.h[2]\n"
+ "fmla z17.h, z7.h, z2.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[2]\n"
+ "fmla z14.h, z6.h, z1.h[2]\n"
+ "fmla z18.h, z6.h, z2.h[2]\n"
+ "fmla z11.h, z7.h, z0.h[2]\n"
+ "fmla z15.h, z7.h, z1.h[2]\n"
+ "fmla z19.h, z7.h, z2.h[2]\n"
+ "ble 40f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[3]\n"
+ "fmla z16.h, z6.h, z2.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[3]\n"
+ "fmla z13.h, z7.h, z1.h[3]\n"
+ "fmla z17.h, z7.h, z2.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[3]\n"
+ "fmla z14.h, z6.h, z1.h[3]\n"
+ "fmla z18.h, z6.h, z2.h[3]\n"
+ "fmla z11.h, z7.h, z0.h[3]\n"
+ "fmla z15.h, z7.h, z1.h[3]\n"
+ "fmla z19.h, z7.h, z2.h[3]\n"
+ "ble 40f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[4]\n"
+ "fmla z16.h, z6.h, z2.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[4]\n"
+ "fmla z13.h, z7.h, z1.h[4]\n"
+ "fmla z17.h, z7.h, z2.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[4]\n"
+ "fmla z14.h, z6.h, z1.h[4]\n"
+ "fmla z18.h, z6.h, z2.h[4]\n"
+ "fmla z11.h, z7.h, z0.h[4]\n"
+ "fmla z15.h, z7.h, z1.h[4]\n"
+ "fmla z19.h, z7.h, z2.h[4]\n"
+ "ble 40f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[5]\n"
+ "fmla z16.h, z6.h, z2.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[5]\n"
+ "fmla z13.h, z7.h, z1.h[5]\n"
+ "fmla z17.h, z7.h, z2.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[5]\n"
+ "fmla z14.h, z6.h, z1.h[5]\n"
+ "fmla z18.h, z6.h, z2.h[5]\n"
+ "fmla z11.h, z7.h, z0.h[5]\n"
+ "fmla z15.h, z7.h, z1.h[5]\n"
+ "fmla z19.h, z7.h, z2.h[5]\n"
+ "ble 40f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[6]\n"
+ "fmla z16.h, z6.h, z2.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[6]\n"
+ "fmla z13.h, z7.h, z1.h[6]\n"
+ "fmla z17.h, z7.h, z2.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[6]\n"
+ "fmla z14.h, z6.h, z1.h[6]\n"
+ "fmla z18.h, z6.h, z2.h[6]\n"
+ "fmla z11.h, z7.h, z0.h[6]\n"
+ "fmla z15.h, z7.h, z1.h[6]\n"
+ "fmla z19.h, z7.h, z2.h[6]\n"
+ "ble 40f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z12.h, z6.h, z1.h[7]\n"
+ "fmla z16.h, z6.h, z2.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[7]\n"
+ "fmla z13.h, z7.h, z1.h[7]\n"
+ "fmla z17.h, z7.h, z2.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[7]\n"
+ "fmla z14.h, z6.h, z1.h[7]\n"
+ "fmla z18.h, z6.h, z2.h[7]\n"
+ "fmla z11.h, z7.h, z0.h[7]\n"
+ "fmla z15.h, z7.h, z1.h[7]\n"
+ "fmla z19.h, z7.h, z2.h[7]\n"
+ "40:" // Height 3: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 35b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "tbz %x[flags], #1, 41f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z1.h }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z0.h }, p5/Z, [x19]\n"
+ "fmin z8.h, p5/M, z8.h, z0.h\n"
+ "fmin z9.h, p5/M, z9.h, z0.h\n"
+ "fmin z10.h, p5/M, z10.h, z0.h\n"
+ "fmin z11.h, p5/M, z11.h, z0.h\n"
+ "fmin z12.h, p5/M, z12.h, z0.h\n"
+ "fmax z8.h, p5/M, z8.h, z1.h\n"
+ "fmax z9.h, p5/M, z9.h, z1.h\n"
+ "fmax z10.h, p5/M, z10.h, z1.h\n"
+ "fmax z11.h, p5/M, z11.h, z1.h\n"
+ "fmax z12.h, p5/M, z12.h, z1.h\n"
+ "fmin z13.h, p5/M, z13.h, z0.h\n"
+ "fmin z14.h, p5/M, z14.h, z0.h\n"
+ "fmin z15.h, p5/M, z15.h, z0.h\n"
+ "fmin z16.h, p5/M, z16.h, z0.h\n"
+ "fmax z13.h, p5/M, z13.h, z1.h\n"
+ "fmax z14.h, p5/M, z14.h, z1.h\n"
+ "fmax z15.h, p5/M, z15.h, z1.h\n"
+ "fmax z16.h, p5/M, z16.h, z1.h\n"
+ "fmin z17.h, p5/M, z17.h, z0.h\n"
+ "fmin z18.h, p5/M, z18.h, z0.h\n"
+ "fmin z19.h, p5/M, z19.h, z0.h\n"
+ "fmax z17.h, p5/M, z17.h, z1.h\n"
+ "fmax z18.h, p5/M, z18.h, z1.h\n"
+ "fmax z19.h, p5/M, z19.h, z1.h\n"
+ "41:" // Height 3: No activation
+ "st1h { z8.h }, p4, [x13]\n"
+ "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1h { z12.h }, p4, [x9]\n"
+ "st1h { z13.h }, p3, [x9, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x9, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1h { z16.h }, p4, [x27]\n"
+ "st1h { z17.h }, p3, [x27, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x27, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "42:" // Height 3: Writeback done
+ "mov x19, #0x0\n"
+ "inch x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 31b\n"
+ "b 86f\n"
+ "43:" // Height 4
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 44f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #1\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #1\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "add x27, x27, x19, LSL #1\n"
+ "add x25, x25, x19, LSL #1\n"
+ "b 45f\n"
+ "44:" // Height 4: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #1\n"
+ "add x27, x9, x19, LSL #1\n"
+ "add x25, x27, x19, LSL #1\n"
+ "45:" // Height 4: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p3.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p2.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p1.h, x19, x16\n"
+ "cbz x14, 46f\n"
+ "ld1h { z8.h }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n"
+ "mov z20.d, z8.d\n"
+ "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "mov z13.d, z9.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "b 48f\n"
+ "46:" // Height 4: no bias
+ "tbz %x[flags], #0, 47f\n"
+ "ld1h { z8.h }, p4/Z, [x13]\n"
+ "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x9]\n"
+ "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x27]\n"
+ "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x25]\n"
+ "ld1h { z21.h }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1h { z22.h }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1h { z23.h }, p1/Z, [x25, #3, MUL VL]\n"
+ "b 48f\n"
+ "47:" // Height 4: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "48:" // Height 4: setup done
+ "mov x12, #0x0\n"
+ "49:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 50f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x12, 51f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "b 51f\n"
+ "50:" // Height 4: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "51:" // Height 4: input setup done
+ "cmp x11, #0x8\n"
+ "ble 53f\n"
+ "52:" // Height 4: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "fmla z8.h, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.h, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.h, z6.h, z1.h[0]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.h, z6.h, z2.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.h, z7.h, z1.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x8\n"
+ "fmla z20.h, z6.h, z3.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z17.h, z7.h, z2.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla z21.h, z7.h, z3.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla z10.h, z6.h, z0.h[0]\n"
+ "fmla z14.h, z6.h, z1.h[0]\n"
+ "fmla z18.h, z6.h, z2.h[0]\n"
+ "fmla z22.h, z6.h, z3.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[0]\n"
+ "fmla z15.h, z7.h, z1.h[0]\n"
+ "fmla z19.h, z7.h, z2.h[0]\n"
+ "fmla z23.h, z7.h, z3.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[1]\n"
+ "fmla z12.h, z6.h, z1.h[1]\n"
+ "fmla z16.h, z6.h, z2.h[1]\n"
+ "fmla z20.h, z6.h, z3.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[1]\n"
+ "fmla z13.h, z7.h, z1.h[1]\n"
+ "fmla z17.h, z7.h, z2.h[1]\n"
+ "fmla z21.h, z7.h, z3.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.h, z6.h, z0.h[1]\n"
+ "fmla z14.h, z6.h, z1.h[1]\n"
+ "fmla z18.h, z6.h, z2.h[1]\n"
+ "fmla z22.h, z6.h, z3.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[1]\n"
+ "fmla z15.h, z7.h, z1.h[1]\n"
+ "fmla z19.h, z7.h, z2.h[1]\n"
+ "fmla z23.h, z7.h, z3.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[2]\n"
+ "fmla z12.h, z6.h, z1.h[2]\n"
+ "fmla z16.h, z6.h, z2.h[2]\n"
+ "fmla z20.h, z6.h, z3.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[2]\n"
+ "fmla z13.h, z7.h, z1.h[2]\n"
+ "fmla z17.h, z7.h, z2.h[2]\n"
+ "fmla z21.h, z7.h, z3.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[2]\n"
+ "fmla z14.h, z6.h, z1.h[2]\n"
+ "fmla z18.h, z6.h, z2.h[2]\n"
+ "fmla z22.h, z6.h, z3.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[2]\n"
+ "fmla z15.h, z7.h, z1.h[2]\n"
+ "fmla z19.h, z7.h, z2.h[2]\n"
+ "fmla z23.h, z7.h, z3.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[3]\n"
+ "fmla z12.h, z6.h, z1.h[3]\n"
+ "fmla z16.h, z6.h, z2.h[3]\n"
+ "fmla z20.h, z6.h, z3.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[3]\n"
+ "fmla z13.h, z7.h, z1.h[3]\n"
+ "fmla z17.h, z7.h, z2.h[3]\n"
+ "fmla z21.h, z7.h, z3.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[3]\n"
+ "fmla z14.h, z6.h, z1.h[3]\n"
+ "fmla z18.h, z6.h, z2.h[3]\n"
+ "fmla z22.h, z6.h, z3.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z11.h, z7.h, z0.h[3]\n"
+ "fmla z15.h, z7.h, z1.h[3]\n"
+ "fmla z19.h, z7.h, z2.h[3]\n"
+ "fmla z23.h, z7.h, z3.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[4]\n"
+ "fmla z12.h, z6.h, z1.h[4]\n"
+ "fmla z16.h, z6.h, z2.h[4]\n"
+ "fmla z20.h, z6.h, z3.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[4]\n"
+ "fmla z13.h, z7.h, z1.h[4]\n"
+ "fmla z17.h, z7.h, z2.h[4]\n"
+ "fmla z21.h, z7.h, z3.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[4]\n"
+ "fmla z14.h, z6.h, z1.h[4]\n"
+ "fmla z18.h, z6.h, z2.h[4]\n"
+ "fmla z22.h, z6.h, z3.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[4]\n"
+ "fmla z15.h, z7.h, z1.h[4]\n"
+ "fmla z19.h, z7.h, z2.h[4]\n"
+ "fmla z23.h, z7.h, z3.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[5]\n"
+ "fmla z12.h, z6.h, z1.h[5]\n"
+ "fmla z16.h, z6.h, z2.h[5]\n"
+ "fmla z20.h, z6.h, z3.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[5]\n"
+ "fmla z13.h, z7.h, z1.h[5]\n"
+ "fmla z17.h, z7.h, z2.h[5]\n"
+ "fmla z21.h, z7.h, z3.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.h, z6.h, z0.h[5]\n"
+ "fmla z14.h, z6.h, z1.h[5]\n"
+ "fmla z18.h, z6.h, z2.h[5]\n"
+ "fmla z22.h, z6.h, z3.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[5]\n"
+ "fmla z15.h, z7.h, z1.h[5]\n"
+ "fmla z19.h, z7.h, z2.h[5]\n"
+ "fmla z23.h, z7.h, z3.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[6]\n"
+ "fmla z12.h, z6.h, z1.h[6]\n"
+ "fmla z16.h, z6.h, z2.h[6]\n"
+ "fmla z20.h, z6.h, z3.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[6]\n"
+ "fmla z13.h, z7.h, z1.h[6]\n"
+ "fmla z17.h, z7.h, z2.h[6]\n"
+ "fmla z21.h, z7.h, z3.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[6]\n"
+ "fmla z14.h, z6.h, z1.h[6]\n"
+ "fmla z18.h, z6.h, z2.h[6]\n"
+ "fmla z22.h, z6.h, z3.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[6]\n"
+ "fmla z15.h, z7.h, z1.h[6]\n"
+ "fmla z19.h, z7.h, z2.h[6]\n"
+ "fmla z23.h, z7.h, z3.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[7]\n"
+ "fmla z12.h, z6.h, z1.h[7]\n"
+ "fmla z16.h, z6.h, z2.h[7]\n"
+ "fmla z20.h, z6.h, z3.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[7]\n"
+ "fmla z13.h, z7.h, z1.h[7]\n"
+ "fmla z17.h, z7.h, z2.h[7]\n"
+ "fmla z21.h, z7.h, z3.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[7]\n"
+ "fmla z14.h, z6.h, z1.h[7]\n"
+ "fmla z18.h, z6.h, z2.h[7]\n"
+ "fmla z22.h, z6.h, z3.h[7]\n"
+ "fmla z11.h, z7.h, z0.h[7]\n"
+ "fmla z15.h, z7.h, z1.h[7]\n"
+ "fmla z19.h, z7.h, z2.h[7]\n"
+ "fmla z23.h, z7.h, z3.h[7]\n"
+ "bgt 52b\n"
+ "53:" // Height 4: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "fmla z8.h, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.h, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.h, z6.h, z1.h[0]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.h, z6.h, z2.h[0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.h, z7.h, z1.h[0]\n"
+ "fmla z17.h, z7.h, z2.h[0]\n"
+ "fmla z20.h, z6.h, z3.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z21.h, z7.h, z3.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[0]\n"
+ "fmla z14.h, z6.h, z1.h[0]\n"
+ "fmla z18.h, z6.h, z2.h[0]\n"
+ "fmla z22.h, z6.h, z3.h[0]\n"
+ "fmla z11.h, z7.h, z0.h[0]\n"
+ "fmla z15.h, z7.h, z1.h[0]\n"
+ "fmla z19.h, z7.h, z2.h[0]\n"
+ "fmla z23.h, z7.h, z3.h[0]\n"
+ "ble 54f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[1]\n"
+ "fmla z16.h, z6.h, z2.h[1]\n"
+ "fmla z20.h, z6.h, z3.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[1]\n"
+ "fmla z13.h, z7.h, z1.h[1]\n"
+ "fmla z17.h, z7.h, z2.h[1]\n"
+ "fmla z21.h, z7.h, z3.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[1]\n"
+ "fmla z14.h, z6.h, z1.h[1]\n"
+ "fmla z18.h, z6.h, z2.h[1]\n"
+ "fmla z22.h, z6.h, z3.h[1]\n"
+ "fmla z11.h, z7.h, z0.h[1]\n"
+ "fmla z15.h, z7.h, z1.h[1]\n"
+ "fmla z19.h, z7.h, z2.h[1]\n"
+ "fmla z23.h, z7.h, z3.h[1]\n"
+ "ble 54f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[2]\n"
+ "fmla z16.h, z6.h, z2.h[2]\n"
+ "fmla z20.h, z6.h, z3.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[2]\n"
+ "fmla z13.h, z7.h, z1.h[2]\n"
+ "fmla z17.h, z7.h, z2.h[2]\n"
+ "fmla z21.h, z7.h, z3.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[2]\n"
+ "fmla z14.h, z6.h, z1.h[2]\n"
+ "fmla z18.h, z6.h, z2.h[2]\n"
+ "fmla z22.h, z6.h, z3.h[2]\n"
+ "fmla z11.h, z7.h, z0.h[2]\n"
+ "fmla z15.h, z7.h, z1.h[2]\n"
+ "fmla z19.h, z7.h, z2.h[2]\n"
+ "fmla z23.h, z7.h, z3.h[2]\n"
+ "ble 54f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[3]\n"
+ "fmla z16.h, z6.h, z2.h[3]\n"
+ "fmla z20.h, z6.h, z3.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[3]\n"
+ "fmla z13.h, z7.h, z1.h[3]\n"
+ "fmla z17.h, z7.h, z2.h[3]\n"
+ "fmla z21.h, z7.h, z3.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[3]\n"
+ "fmla z14.h, z6.h, z1.h[3]\n"
+ "fmla z18.h, z6.h, z2.h[3]\n"
+ "fmla z22.h, z6.h, z3.h[3]\n"
+ "fmla z11.h, z7.h, z0.h[3]\n"
+ "fmla z15.h, z7.h, z1.h[3]\n"
+ "fmla z19.h, z7.h, z2.h[3]\n"
+ "fmla z23.h, z7.h, z3.h[3]\n"
+ "ble 54f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[4]\n"
+ "fmla z16.h, z6.h, z2.h[4]\n"
+ "fmla z20.h, z6.h, z3.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[4]\n"
+ "fmla z13.h, z7.h, z1.h[4]\n"
+ "fmla z17.h, z7.h, z2.h[4]\n"
+ "fmla z21.h, z7.h, z3.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[4]\n"
+ "fmla z14.h, z6.h, z1.h[4]\n"
+ "fmla z18.h, z6.h, z2.h[4]\n"
+ "fmla z22.h, z6.h, z3.h[4]\n"
+ "fmla z11.h, z7.h, z0.h[4]\n"
+ "fmla z15.h, z7.h, z1.h[4]\n"
+ "fmla z19.h, z7.h, z2.h[4]\n"
+ "fmla z23.h, z7.h, z3.h[4]\n"
+ "ble 54f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[5]\n"
+ "fmla z16.h, z6.h, z2.h[5]\n"
+ "fmla z20.h, z6.h, z3.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[5]\n"
+ "fmla z13.h, z7.h, z1.h[5]\n"
+ "fmla z17.h, z7.h, z2.h[5]\n"
+ "fmla z21.h, z7.h, z3.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[5]\n"
+ "fmla z14.h, z6.h, z1.h[5]\n"
+ "fmla z18.h, z6.h, z2.h[5]\n"
+ "fmla z22.h, z6.h, z3.h[5]\n"
+ "fmla z11.h, z7.h, z0.h[5]\n"
+ "fmla z15.h, z7.h, z1.h[5]\n"
+ "fmla z19.h, z7.h, z2.h[5]\n"
+ "fmla z23.h, z7.h, z3.h[5]\n"
+ "ble 54f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[6]\n"
+ "fmla z16.h, z6.h, z2.h[6]\n"
+ "fmla z20.h, z6.h, z3.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[6]\n"
+ "fmla z13.h, z7.h, z1.h[6]\n"
+ "fmla z17.h, z7.h, z2.h[6]\n"
+ "fmla z21.h, z7.h, z3.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[6]\n"
+ "fmla z14.h, z6.h, z1.h[6]\n"
+ "fmla z18.h, z6.h, z2.h[6]\n"
+ "fmla z22.h, z6.h, z3.h[6]\n"
+ "fmla z11.h, z7.h, z0.h[6]\n"
+ "fmla z15.h, z7.h, z1.h[6]\n"
+ "fmla z19.h, z7.h, z2.h[6]\n"
+ "fmla z23.h, z7.h, z3.h[6]\n"
+ "ble 54f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z12.h, z6.h, z1.h[7]\n"
+ "fmla z16.h, z6.h, z2.h[7]\n"
+ "fmla z20.h, z6.h, z3.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[7]\n"
+ "fmla z13.h, z7.h, z1.h[7]\n"
+ "fmla z17.h, z7.h, z2.h[7]\n"
+ "fmla z21.h, z7.h, z3.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[7]\n"
+ "fmla z14.h, z6.h, z1.h[7]\n"
+ "fmla z18.h, z6.h, z2.h[7]\n"
+ "fmla z22.h, z6.h, z3.h[7]\n"
+ "fmla z11.h, z7.h, z0.h[7]\n"
+ "fmla z15.h, z7.h, z1.h[7]\n"
+ "fmla z19.h, z7.h, z2.h[7]\n"
+ "fmla z23.h, z7.h, z3.h[7]\n"
+ "54:" // Height 4: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 49b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbz %x[flags], #1, 55f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z1.h }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z0.h }, p5/Z, [x19]\n"
+ "fmin z8.h, p5/M, z8.h, z0.h\n"
+ "fmin z9.h, p5/M, z9.h, z0.h\n"
+ "fmin z10.h, p5/M, z10.h, z0.h\n"
+ "fmin z11.h, p5/M, z11.h, z0.h\n"
+ "fmin z12.h, p5/M, z12.h, z0.h\n"
+ "fmax z8.h, p5/M, z8.h, z1.h\n"
+ "fmax z9.h, p5/M, z9.h, z1.h\n"
+ "fmax z10.h, p5/M, z10.h, z1.h\n"
+ "fmax z11.h, p5/M, z11.h, z1.h\n"
+ "fmax z12.h, p5/M, z12.h, z1.h\n"
+ "fmin z13.h, p5/M, z13.h, z0.h\n"
+ "fmin z14.h, p5/M, z14.h, z0.h\n"
+ "fmin z15.h, p5/M, z15.h, z0.h\n"
+ "fmin z16.h, p5/M, z16.h, z0.h\n"
+ "fmax z13.h, p5/M, z13.h, z1.h\n"
+ "fmax z14.h, p5/M, z14.h, z1.h\n"
+ "fmax z15.h, p5/M, z15.h, z1.h\n"
+ "fmax z16.h, p5/M, z16.h, z1.h\n"
+ "fmin z17.h, p5/M, z17.h, z0.h\n"
+ "fmin z18.h, p5/M, z18.h, z0.h\n"
+ "fmin z19.h, p5/M, z19.h, z0.h\n"
+ "fmin z20.h, p5/M, z20.h, z0.h\n"
+ "fmax z17.h, p5/M, z17.h, z1.h\n"
+ "fmax z18.h, p5/M, z18.h, z1.h\n"
+ "fmax z19.h, p5/M, z19.h, z1.h\n"
+ "fmax z20.h, p5/M, z20.h, z1.h\n"
+ "fmin z21.h, p5/M, z21.h, z0.h\n"
+ "fmin z22.h, p5/M, z22.h, z0.h\n"
+ "fmin z23.h, p5/M, z23.h, z0.h\n"
+ "fmax z21.h, p5/M, z21.h, z1.h\n"
+ "fmax z22.h, p5/M, z22.h, z1.h\n"
+ "fmax z23.h, p5/M, z23.h, z1.h\n"
+ "55:" // Height 4: No activation
+ "st1h { z8.h }, p4, [x13]\n"
+ "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1h { z12.h }, p4, [x9]\n"
+ "st1h { z13.h }, p3, [x9, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x9, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1h { z16.h }, p4, [x27]\n"
+ "st1h { z17.h }, p3, [x27, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x27, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1h { z20.h }, p4, [x25]\n"
+ "st1h { z21.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z23.h }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "56:" // Height 4: Writeback done
+ "mov x19, #0x0\n"
+ "inch x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 45b\n"
+ "b 86f\n"
+ "57:" // Height 5
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 58f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #1\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #1\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #1\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
+ "b 59f\n"
+ "58:" // Height 5: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #1\n"
+ "add x27, x9, x19, LSL #1\n"
+ "add x25, x27, x19, LSL #1\n"
+ "add x23, x25, x19, LSL #1\n"
+ "59:" // Height 5: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p3.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p2.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p1.h, x19, x16\n"
+ "cbz x14, 60f\n"
+ "ld1h { z8.h }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n"
+ "mov z20.d, z8.d\n"
+ "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "mov z13.d, z9.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "b 62f\n"
+ "60:" // Height 5: no bias
+ "tbz %x[flags], #0, 61f\n"
+ "ld1h { z8.h }, p4/Z, [x13]\n"
+ "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x9]\n"
+ "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x27]\n"
+ "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x25]\n"
+ "ld1h { z21.h }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1h { z22.h }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1h { z23.h }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1h { z24.h }, p4/Z, [x23]\n"
+ "ld1h { z25.h }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z26.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z27.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "b 62f\n"
+ "61:" // Height 5: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "62:" // Height 5: setup done
+ "mov x12, #0x0\n"
+ "63:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 64f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x12, 65f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "b 65f\n"
+ "64:" // Height 5: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "add x22, x24, x19, LSL #1\n"
+ "65:" // Height 5: input setup done
+ "cmp x11, #0x8\n"
+ "ble 67f\n"
+ "66:" // Height 5: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "fmla z8.h, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.h, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.h, z6.h, z1.h[0]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.h, z6.h, z2.h[0]\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.h, z7.h, z1.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "fmla z20.h, z6.h, z3.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x8\n"
+ "fmla z24.h, z6.h, z4.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z17.h, z7.h, z2.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla z21.h, z7.h, z3.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla z25.h, z7.h, z4.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla z14.h, z6.h, z1.h[0]\n"
+ "fmla z18.h, z6.h, z2.h[0]\n"
+ "fmla z22.h, z6.h, z3.h[0]\n"
+ "fmla z26.h, z6.h, z4.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[0]\n"
+ "fmla z15.h, z7.h, z1.h[0]\n"
+ "fmla z19.h, z7.h, z2.h[0]\n"
+ "fmla z23.h, z7.h, z3.h[0]\n"
+ "fmla z27.h, z7.h, z4.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[1]\n"
+ "fmla z12.h, z6.h, z1.h[1]\n"
+ "fmla z16.h, z6.h, z2.h[1]\n"
+ "fmla z20.h, z6.h, z3.h[1]\n"
+ "fmla z24.h, z6.h, z4.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[1]\n"
+ "fmla z13.h, z7.h, z1.h[1]\n"
+ "fmla z17.h, z7.h, z2.h[1]\n"
+ "fmla z21.h, z7.h, z3.h[1]\n"
+ "fmla z25.h, z7.h, z4.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.h, z6.h, z0.h[1]\n"
+ "fmla z14.h, z6.h, z1.h[1]\n"
+ "fmla z18.h, z6.h, z2.h[1]\n"
+ "fmla z22.h, z6.h, z3.h[1]\n"
+ "fmla z26.h, z6.h, z4.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[1]\n"
+ "fmla z15.h, z7.h, z1.h[1]\n"
+ "fmla z19.h, z7.h, z2.h[1]\n"
+ "fmla z23.h, z7.h, z3.h[1]\n"
+ "fmla z27.h, z7.h, z4.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[2]\n"
+ "fmla z12.h, z6.h, z1.h[2]\n"
+ "fmla z16.h, z6.h, z2.h[2]\n"
+ "fmla z20.h, z6.h, z3.h[2]\n"
+ "fmla z24.h, z6.h, z4.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[2]\n"
+ "fmla z13.h, z7.h, z1.h[2]\n"
+ "fmla z17.h, z7.h, z2.h[2]\n"
+ "fmla z21.h, z7.h, z3.h[2]\n"
+ "fmla z25.h, z7.h, z4.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[2]\n"
+ "fmla z14.h, z6.h, z1.h[2]\n"
+ "fmla z18.h, z6.h, z2.h[2]\n"
+ "fmla z22.h, z6.h, z3.h[2]\n"
+ "fmla z26.h, z6.h, z4.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[2]\n"
+ "fmla z15.h, z7.h, z1.h[2]\n"
+ "fmla z19.h, z7.h, z2.h[2]\n"
+ "fmla z23.h, z7.h, z3.h[2]\n"
+ "fmla z27.h, z7.h, z4.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[3]\n"
+ "fmla z12.h, z6.h, z1.h[3]\n"
+ "fmla z16.h, z6.h, z2.h[3]\n"
+ "fmla z20.h, z6.h, z3.h[3]\n"
+ "fmla z24.h, z6.h, z4.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[3]\n"
+ "fmla z13.h, z7.h, z1.h[3]\n"
+ "fmla z17.h, z7.h, z2.h[3]\n"
+ "fmla z21.h, z7.h, z3.h[3]\n"
+ "fmla z25.h, z7.h, z4.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[3]\n"
+ "fmla z14.h, z6.h, z1.h[3]\n"
+ "fmla z18.h, z6.h, z2.h[3]\n"
+ "fmla z22.h, z6.h, z3.h[3]\n"
+ "fmla z26.h, z6.h, z4.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z11.h, z7.h, z0.h[3]\n"
+ "fmla z15.h, z7.h, z1.h[3]\n"
+ "fmla z19.h, z7.h, z2.h[3]\n"
+ "fmla z23.h, z7.h, z3.h[3]\n"
+ "fmla z27.h, z7.h, z4.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[4]\n"
+ "fmla z12.h, z6.h, z1.h[4]\n"
+ "fmla z16.h, z6.h, z2.h[4]\n"
+ "fmla z20.h, z6.h, z3.h[4]\n"
+ "fmla z24.h, z6.h, z4.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[4]\n"
+ "fmla z13.h, z7.h, z1.h[4]\n"
+ "fmla z17.h, z7.h, z2.h[4]\n"
+ "fmla z21.h, z7.h, z3.h[4]\n"
+ "fmla z25.h, z7.h, z4.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[4]\n"
+ "fmla z14.h, z6.h, z1.h[4]\n"
+ "fmla z18.h, z6.h, z2.h[4]\n"
+ "fmla z22.h, z6.h, z3.h[4]\n"
+ "fmla z26.h, z6.h, z4.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[4]\n"
+ "fmla z15.h, z7.h, z1.h[4]\n"
+ "fmla z19.h, z7.h, z2.h[4]\n"
+ "fmla z23.h, z7.h, z3.h[4]\n"
+ "fmla z27.h, z7.h, z4.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[5]\n"
+ "fmla z12.h, z6.h, z1.h[5]\n"
+ "fmla z16.h, z6.h, z2.h[5]\n"
+ "fmla z20.h, z6.h, z3.h[5]\n"
+ "fmla z24.h, z6.h, z4.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[5]\n"
+ "fmla z13.h, z7.h, z1.h[5]\n"
+ "fmla z17.h, z7.h, z2.h[5]\n"
+ "fmla z21.h, z7.h, z3.h[5]\n"
+ "fmla z25.h, z7.h, z4.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.h, z6.h, z0.h[5]\n"
+ "fmla z14.h, z6.h, z1.h[5]\n"
+ "fmla z18.h, z6.h, z2.h[5]\n"
+ "fmla z22.h, z6.h, z3.h[5]\n"
+ "fmla z26.h, z6.h, z4.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[5]\n"
+ "fmla z15.h, z7.h, z1.h[5]\n"
+ "fmla z19.h, z7.h, z2.h[5]\n"
+ "fmla z23.h, z7.h, z3.h[5]\n"
+ "fmla z27.h, z7.h, z4.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[6]\n"
+ "fmla z12.h, z6.h, z1.h[6]\n"
+ "fmla z16.h, z6.h, z2.h[6]\n"
+ "fmla z20.h, z6.h, z3.h[6]\n"
+ "fmla z24.h, z6.h, z4.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[6]\n"
+ "fmla z13.h, z7.h, z1.h[6]\n"
+ "fmla z17.h, z7.h, z2.h[6]\n"
+ "fmla z21.h, z7.h, z3.h[6]\n"
+ "fmla z25.h, z7.h, z4.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[6]\n"
+ "fmla z14.h, z6.h, z1.h[6]\n"
+ "fmla z18.h, z6.h, z2.h[6]\n"
+ "fmla z22.h, z6.h, z3.h[6]\n"
+ "fmla z26.h, z6.h, z4.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[6]\n"
+ "fmla z15.h, z7.h, z1.h[6]\n"
+ "fmla z19.h, z7.h, z2.h[6]\n"
+ "fmla z23.h, z7.h, z3.h[6]\n"
+ "fmla z27.h, z7.h, z4.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[7]\n"
+ "fmla z12.h, z6.h, z1.h[7]\n"
+ "fmla z16.h, z6.h, z2.h[7]\n"
+ "fmla z20.h, z6.h, z3.h[7]\n"
+ "fmla z24.h, z6.h, z4.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[7]\n"
+ "fmla z13.h, z7.h, z1.h[7]\n"
+ "fmla z17.h, z7.h, z2.h[7]\n"
+ "fmla z21.h, z7.h, z3.h[7]\n"
+ "fmla z25.h, z7.h, z4.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[7]\n"
+ "fmla z14.h, z6.h, z1.h[7]\n"
+ "fmla z18.h, z6.h, z2.h[7]\n"
+ "fmla z22.h, z6.h, z3.h[7]\n"
+ "fmla z26.h, z6.h, z4.h[7]\n"
+ "fmla z11.h, z7.h, z0.h[7]\n"
+ "fmla z15.h, z7.h, z1.h[7]\n"
+ "fmla z19.h, z7.h, z2.h[7]\n"
+ "fmla z23.h, z7.h, z3.h[7]\n"
+ "fmla z27.h, z7.h, z4.h[7]\n"
+ "bgt 66b\n"
+ "67:" // Height 5: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "fmla z8.h, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.h, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.h, z6.h, z1.h[0]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.h, z6.h, z2.h[0]\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.h, z7.h, z1.h[0]\n"
+ "add x22, x22, #0x10\n"
+ "fmla z17.h, z7.h, z2.h[0]\n"
+ "fmla z20.h, z6.h, z3.h[0]\n"
+ "fmla z24.h, z6.h, z4.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z21.h, z7.h, z3.h[0]\n"
+ "fmla z25.h, z7.h, z4.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[0]\n"
+ "fmla z14.h, z6.h, z1.h[0]\n"
+ "fmla z18.h, z6.h, z2.h[0]\n"
+ "fmla z22.h, z6.h, z3.h[0]\n"
+ "fmla z26.h, z6.h, z4.h[0]\n"
+ "fmla z11.h, z7.h, z0.h[0]\n"
+ "fmla z15.h, z7.h, z1.h[0]\n"
+ "fmla z19.h, z7.h, z2.h[0]\n"
+ "fmla z23.h, z7.h, z3.h[0]\n"
+ "fmla z27.h, z7.h, z4.h[0]\n"
+ "ble 68f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[1]\n"
+ "fmla z16.h, z6.h, z2.h[1]\n"
+ "fmla z20.h, z6.h, z3.h[1]\n"
+ "fmla z24.h, z6.h, z4.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[1]\n"
+ "fmla z13.h, z7.h, z1.h[1]\n"
+ "fmla z17.h, z7.h, z2.h[1]\n"
+ "fmla z21.h, z7.h, z3.h[1]\n"
+ "fmla z25.h, z7.h, z4.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[1]\n"
+ "fmla z14.h, z6.h, z1.h[1]\n"
+ "fmla z18.h, z6.h, z2.h[1]\n"
+ "fmla z22.h, z6.h, z3.h[1]\n"
+ "fmla z26.h, z6.h, z4.h[1]\n"
+ "fmla z11.h, z7.h, z0.h[1]\n"
+ "fmla z15.h, z7.h, z1.h[1]\n"
+ "fmla z19.h, z7.h, z2.h[1]\n"
+ "fmla z23.h, z7.h, z3.h[1]\n"
+ "fmla z27.h, z7.h, z4.h[1]\n"
+ "ble 68f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[2]\n"
+ "fmla z16.h, z6.h, z2.h[2]\n"
+ "fmla z20.h, z6.h, z3.h[2]\n"
+ "fmla z24.h, z6.h, z4.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[2]\n"
+ "fmla z13.h, z7.h, z1.h[2]\n"
+ "fmla z17.h, z7.h, z2.h[2]\n"
+ "fmla z21.h, z7.h, z3.h[2]\n"
+ "fmla z25.h, z7.h, z4.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[2]\n"
+ "fmla z14.h, z6.h, z1.h[2]\n"
+ "fmla z18.h, z6.h, z2.h[2]\n"
+ "fmla z22.h, z6.h, z3.h[2]\n"
+ "fmla z26.h, z6.h, z4.h[2]\n"
+ "fmla z11.h, z7.h, z0.h[2]\n"
+ "fmla z15.h, z7.h, z1.h[2]\n"
+ "fmla z19.h, z7.h, z2.h[2]\n"
+ "fmla z23.h, z7.h, z3.h[2]\n"
+ "fmla z27.h, z7.h, z4.h[2]\n"
+ "ble 68f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[3]\n"
+ "fmla z16.h, z6.h, z2.h[3]\n"
+ "fmla z20.h, z6.h, z3.h[3]\n"
+ "fmla z24.h, z6.h, z4.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[3]\n"
+ "fmla z13.h, z7.h, z1.h[3]\n"
+ "fmla z17.h, z7.h, z2.h[3]\n"
+ "fmla z21.h, z7.h, z3.h[3]\n"
+ "fmla z25.h, z7.h, z4.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[3]\n"
+ "fmla z14.h, z6.h, z1.h[3]\n"
+ "fmla z18.h, z6.h, z2.h[3]\n"
+ "fmla z22.h, z6.h, z3.h[3]\n"
+ "fmla z26.h, z6.h, z4.h[3]\n"
+ "fmla z11.h, z7.h, z0.h[3]\n"
+ "fmla z15.h, z7.h, z1.h[3]\n"
+ "fmla z19.h, z7.h, z2.h[3]\n"
+ "fmla z23.h, z7.h, z3.h[3]\n"
+ "fmla z27.h, z7.h, z4.h[3]\n"
+ "ble 68f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[4]\n"
+ "fmla z16.h, z6.h, z2.h[4]\n"
+ "fmla z20.h, z6.h, z3.h[4]\n"
+ "fmla z24.h, z6.h, z4.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[4]\n"
+ "fmla z13.h, z7.h, z1.h[4]\n"
+ "fmla z17.h, z7.h, z2.h[4]\n"
+ "fmla z21.h, z7.h, z3.h[4]\n"
+ "fmla z25.h, z7.h, z4.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[4]\n"
+ "fmla z14.h, z6.h, z1.h[4]\n"
+ "fmla z18.h, z6.h, z2.h[4]\n"
+ "fmla z22.h, z6.h, z3.h[4]\n"
+ "fmla z26.h, z6.h, z4.h[4]\n"
+ "fmla z11.h, z7.h, z0.h[4]\n"
+ "fmla z15.h, z7.h, z1.h[4]\n"
+ "fmla z19.h, z7.h, z2.h[4]\n"
+ "fmla z23.h, z7.h, z3.h[4]\n"
+ "fmla z27.h, z7.h, z4.h[4]\n"
+ "ble 68f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[5]\n"
+ "fmla z16.h, z6.h, z2.h[5]\n"
+ "fmla z20.h, z6.h, z3.h[5]\n"
+ "fmla z24.h, z6.h, z4.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[5]\n"
+ "fmla z13.h, z7.h, z1.h[5]\n"
+ "fmla z17.h, z7.h, z2.h[5]\n"
+ "fmla z21.h, z7.h, z3.h[5]\n"
+ "fmla z25.h, z7.h, z4.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[5]\n"
+ "fmla z14.h, z6.h, z1.h[5]\n"
+ "fmla z18.h, z6.h, z2.h[5]\n"
+ "fmla z22.h, z6.h, z3.h[5]\n"
+ "fmla z26.h, z6.h, z4.h[5]\n"
+ "fmla z11.h, z7.h, z0.h[5]\n"
+ "fmla z15.h, z7.h, z1.h[5]\n"
+ "fmla z19.h, z7.h, z2.h[5]\n"
+ "fmla z23.h, z7.h, z3.h[5]\n"
+ "fmla z27.h, z7.h, z4.h[5]\n"
+ "ble 68f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[6]\n"
+ "fmla z16.h, z6.h, z2.h[6]\n"
+ "fmla z20.h, z6.h, z3.h[6]\n"
+ "fmla z24.h, z6.h, z4.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[6]\n"
+ "fmla z13.h, z7.h, z1.h[6]\n"
+ "fmla z17.h, z7.h, z2.h[6]\n"
+ "fmla z21.h, z7.h, z3.h[6]\n"
+ "fmla z25.h, z7.h, z4.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[6]\n"
+ "fmla z14.h, z6.h, z1.h[6]\n"
+ "fmla z18.h, z6.h, z2.h[6]\n"
+ "fmla z22.h, z6.h, z3.h[6]\n"
+ "fmla z26.h, z6.h, z4.h[6]\n"
+ "fmla z11.h, z7.h, z0.h[6]\n"
+ "fmla z15.h, z7.h, z1.h[6]\n"
+ "fmla z19.h, z7.h, z2.h[6]\n"
+ "fmla z23.h, z7.h, z3.h[6]\n"
+ "fmla z27.h, z7.h, z4.h[6]\n"
+ "ble 68f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z12.h, z6.h, z1.h[7]\n"
+ "fmla z16.h, z6.h, z2.h[7]\n"
+ "fmla z20.h, z6.h, z3.h[7]\n"
+ "fmla z24.h, z6.h, z4.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[7]\n"
+ "fmla z13.h, z7.h, z1.h[7]\n"
+ "fmla z17.h, z7.h, z2.h[7]\n"
+ "fmla z21.h, z7.h, z3.h[7]\n"
+ "fmla z25.h, z7.h, z4.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[7]\n"
+ "fmla z14.h, z6.h, z1.h[7]\n"
+ "fmla z18.h, z6.h, z2.h[7]\n"
+ "fmla z22.h, z6.h, z3.h[7]\n"
+ "fmla z26.h, z6.h, z4.h[7]\n"
+ "fmla z11.h, z7.h, z0.h[7]\n"
+ "fmla z15.h, z7.h, z1.h[7]\n"
+ "fmla z19.h, z7.h, z2.h[7]\n"
+ "fmla z23.h, z7.h, z3.h[7]\n"
+ "fmla z27.h, z7.h, z4.h[7]\n"
+ "68:" // Height 5: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 63b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 69f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z1.h }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z0.h }, p5/Z, [x19]\n"
+ "fmin z8.h, p5/M, z8.h, z0.h\n"
+ "fmin z9.h, p5/M, z9.h, z0.h\n"
+ "fmin z10.h, p5/M, z10.h, z0.h\n"
+ "fmin z11.h, p5/M, z11.h, z0.h\n"
+ "fmin z12.h, p5/M, z12.h, z0.h\n"
+ "fmax z8.h, p5/M, z8.h, z1.h\n"
+ "fmax z9.h, p5/M, z9.h, z1.h\n"
+ "fmax z10.h, p5/M, z10.h, z1.h\n"
+ "fmax z11.h, p5/M, z11.h, z1.h\n"
+ "fmax z12.h, p5/M, z12.h, z1.h\n"
+ "fmin z13.h, p5/M, z13.h, z0.h\n"
+ "fmin z14.h, p5/M, z14.h, z0.h\n"
+ "fmin z15.h, p5/M, z15.h, z0.h\n"
+ "fmin z16.h, p5/M, z16.h, z0.h\n"
+ "fmax z13.h, p5/M, z13.h, z1.h\n"
+ "fmax z14.h, p5/M, z14.h, z1.h\n"
+ "fmax z15.h, p5/M, z15.h, z1.h\n"
+ "fmax z16.h, p5/M, z16.h, z1.h\n"
+ "fmin z17.h, p5/M, z17.h, z0.h\n"
+ "fmin z18.h, p5/M, z18.h, z0.h\n"
+ "fmin z19.h, p5/M, z19.h, z0.h\n"
+ "fmin z20.h, p5/M, z20.h, z0.h\n"
+ "fmax z17.h, p5/M, z17.h, z1.h\n"
+ "fmax z18.h, p5/M, z18.h, z1.h\n"
+ "fmax z19.h, p5/M, z19.h, z1.h\n"
+ "fmax z20.h, p5/M, z20.h, z1.h\n"
+ "fmin z21.h, p5/M, z21.h, z0.h\n"
+ "fmin z22.h, p5/M, z22.h, z0.h\n"
+ "fmin z23.h, p5/M, z23.h, z0.h\n"
+ "fmin z24.h, p5/M, z24.h, z0.h\n"
+ "fmax z21.h, p5/M, z21.h, z1.h\n"
+ "fmax z22.h, p5/M, z22.h, z1.h\n"
+ "fmax z23.h, p5/M, z23.h, z1.h\n"
+ "fmax z24.h, p5/M, z24.h, z1.h\n"
+ "fmin z25.h, p5/M, z25.h, z0.h\n"
+ "fmin z26.h, p5/M, z26.h, z0.h\n"
+ "fmin z27.h, p5/M, z27.h, z0.h\n"
+ "fmax z25.h, p5/M, z25.h, z1.h\n"
+ "fmax z26.h, p5/M, z26.h, z1.h\n"
+ "fmax z27.h, p5/M, z27.h, z1.h\n"
+ "69:" // Height 5: No activation
+ "st1h { z8.h }, p4, [x13]\n"
+ "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1h { z12.h }, p4, [x9]\n"
+ "st1h { z13.h }, p3, [x9, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x9, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1h { z16.h }, p4, [x27]\n"
+ "st1h { z17.h }, p3, [x27, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x27, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1h { z20.h }, p4, [x25]\n"
+ "st1h { z21.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z23.h }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "st1h { z24.h }, p4, [x23]\n"
+ "st1h { z25.h }, p3, [x23, #1, MUL VL]\n"
+ "st1h { z26.h }, p2, [x23, #2, MUL VL]\n"
+ "st1h { z27.h }, p1, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "70:" // Height 5: Writeback done
+ "mov x19, #0x0\n"
+ "inch x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 59b\n"
+ "b 86f\n"
+ "71:" // Height 6
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 72f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #1\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #1\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #1\n"
+ "ldr x21, [%x[output_ptr], #0x28]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "add x25, x25, x19, LSL #1\n"
+ "add x23, x23, x19, LSL #1\n"
+ "add x21, x21, x19, LSL #1\n"
+ "b 73f\n"
+ "72:" // Height 6: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #1\n"
+ "add x27, x9, x19, LSL #1\n"
+ "add x25, x27, x19, LSL #1\n"
+ "add x23, x25, x19, LSL #1\n"
+ "add x21, x23, x19, LSL #1\n"
+ "add %x[output_ptr], x21, x19, LSL #1\n"
+ "73:" // Height 6: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p3.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p2.h, x19, x16\n"
+ "inch x19\n"
+ "whilelt p1.h, x19, x16\n"
+ "cbz x14, 74f\n"
+ "ld1h { z8.h }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1h { z9.h }, p5/Z, [x14, #1, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1h { z10.h }, p5/Z, [x14, #2, MUL VL]\n"
+ "mov z20.d, z8.d\n"
+ "ld1h { z11.h }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "mov z13.d, z9.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "mov z28.d, z8.d\n"
+ "mov z29.d, z9.d\n"
+ "mov z30.d, z10.d\n"
+ "mov z31.d, z11.d\n"
+ "b 76f\n"
+ "74:" // Height 6: no bias
+ "tbz %x[flags], #0, 75f\n"
+ "ld1h { z8.h }, p4/Z, [x13]\n"
+ "ld1h { z9.h }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1h { z10.h }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1h { z11.h }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1h { z12.h }, p4/Z, [x9]\n"
+ "ld1h { z13.h }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1h { z14.h }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1h { z15.h }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1h { z16.h }, p4/Z, [x27]\n"
+ "ld1h { z17.h }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1h { z18.h }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1h { z19.h }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1h { z20.h }, p4/Z, [x25]\n"
+ "ld1h { z21.h }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1h { z22.h }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1h { z23.h }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1h { z24.h }, p4/Z, [x23]\n"
+ "ld1h { z25.h }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1h { z26.h }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1h { z27.h }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1h { z28.h }, p4/Z, [x21]\n"
+ "ld1h { z29.h }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1h { z30.h }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1h { z31.h }, p1/Z, [x21, #3, MUL VL]\n"
+ "b 76f\n"
+ "75:" // Height 6: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "76:" // Height 6: setup done
+ "mov x12, #0x0\n"
+ "77:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 78f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x12, 79f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #1\n"
+ "add x28, x28, x19, LSL #1\n"
+ "add x26, x26, x19, LSL #1\n"
+ "add x24, x24, x19, LSL #1\n"
+ "add x22, x22, x19, LSL #1\n"
+ "add x20, x20, x19, LSL #1\n"
+ "b 79f\n"
+ "78:" // Height 6: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #1\n"
+ "add x26, x28, x19, LSL #1\n"
+ "add x24, x26, x19, LSL #1\n"
+ "add x22, x24, x19, LSL #1\n"
+ "add x20, x22, x19, LSL #1\n"
+ "79:" // Height 6: input setup done
+ "cmp x11, #0x8\n"
+ "ble 81f\n"
+ "80:" // Height 6: Multiply loop: Main loop head
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x8\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "fmla z8.h, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.h, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.h, z6.h, z1.h[0]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.h, z6.h, z2.h[0]\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.h, z7.h, z1.h[0]\n"
+ "ld1rqh { z5.h }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "fmla z20.h, z6.h, z3.h[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z24.h, z6.h, z4.h[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x8\n"
+ "fmla z28.h, z6.h, z5.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z17.h, z7.h, z2.h[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla z21.h, z7.h, z3.h[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla z25.h, z7.h, z4.h[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla z29.h, z7.h, z5.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "fmla z14.h, z6.h, z1.h[0]\n"
+ "fmla z18.h, z6.h, z2.h[0]\n"
+ "fmla z22.h, z6.h, z3.h[0]\n"
+ "fmla z26.h, z6.h, z4.h[0]\n"
+ "fmla z30.h, z6.h, z5.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[0]\n"
+ "fmla z15.h, z7.h, z1.h[0]\n"
+ "fmla z19.h, z7.h, z2.h[0]\n"
+ "fmla z23.h, z7.h, z3.h[0]\n"
+ "fmla z27.h, z7.h, z4.h[0]\n"
+ "fmla z31.h, z7.h, z5.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[1]\n"
+ "fmla z12.h, z6.h, z1.h[1]\n"
+ "fmla z16.h, z6.h, z2.h[1]\n"
+ "fmla z20.h, z6.h, z3.h[1]\n"
+ "fmla z24.h, z6.h, z4.h[1]\n"
+ "fmla z28.h, z6.h, z5.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[1]\n"
+ "fmla z13.h, z7.h, z1.h[1]\n"
+ "fmla z17.h, z7.h, z2.h[1]\n"
+ "fmla z21.h, z7.h, z3.h[1]\n"
+ "fmla z25.h, z7.h, z4.h[1]\n"
+ "fmla z29.h, z7.h, z5.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.h, z6.h, z0.h[1]\n"
+ "fmla z14.h, z6.h, z1.h[1]\n"
+ "fmla z18.h, z6.h, z2.h[1]\n"
+ "fmla z22.h, z6.h, z3.h[1]\n"
+ "fmla z26.h, z6.h, z4.h[1]\n"
+ "fmla z30.h, z6.h, z5.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[1]\n"
+ "fmla z15.h, z7.h, z1.h[1]\n"
+ "fmla z19.h, z7.h, z2.h[1]\n"
+ "fmla z23.h, z7.h, z3.h[1]\n"
+ "fmla z27.h, z7.h, z4.h[1]\n"
+ "fmla z31.h, z7.h, z5.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[2]\n"
+ "fmla z12.h, z6.h, z1.h[2]\n"
+ "fmla z16.h, z6.h, z2.h[2]\n"
+ "fmla z20.h, z6.h, z3.h[2]\n"
+ "fmla z24.h, z6.h, z4.h[2]\n"
+ "fmla z28.h, z6.h, z5.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[2]\n"
+ "fmla z13.h, z7.h, z1.h[2]\n"
+ "fmla z17.h, z7.h, z2.h[2]\n"
+ "fmla z21.h, z7.h, z3.h[2]\n"
+ "fmla z25.h, z7.h, z4.h[2]\n"
+ "fmla z29.h, z7.h, z5.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[2]\n"
+ "fmla z14.h, z6.h, z1.h[2]\n"
+ "fmla z18.h, z6.h, z2.h[2]\n"
+ "fmla z22.h, z6.h, z3.h[2]\n"
+ "fmla z26.h, z6.h, z4.h[2]\n"
+ "fmla z30.h, z6.h, z5.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[2]\n"
+ "fmla z15.h, z7.h, z1.h[2]\n"
+ "fmla z19.h, z7.h, z2.h[2]\n"
+ "fmla z23.h, z7.h, z3.h[2]\n"
+ "fmla z27.h, z7.h, z4.h[2]\n"
+ "fmla z31.h, z7.h, z5.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[3]\n"
+ "fmla z12.h, z6.h, z1.h[3]\n"
+ "fmla z16.h, z6.h, z2.h[3]\n"
+ "fmla z20.h, z6.h, z3.h[3]\n"
+ "fmla z24.h, z6.h, z4.h[3]\n"
+ "fmla z28.h, z6.h, z5.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[3]\n"
+ "fmla z13.h, z7.h, z1.h[3]\n"
+ "fmla z17.h, z7.h, z2.h[3]\n"
+ "fmla z21.h, z7.h, z3.h[3]\n"
+ "fmla z25.h, z7.h, z4.h[3]\n"
+ "fmla z29.h, z7.h, z5.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[3]\n"
+ "fmla z14.h, z6.h, z1.h[3]\n"
+ "fmla z18.h, z6.h, z2.h[3]\n"
+ "fmla z22.h, z6.h, z3.h[3]\n"
+ "fmla z26.h, z6.h, z4.h[3]\n"
+ "fmla z30.h, z6.h, z5.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z11.h, z7.h, z0.h[3]\n"
+ "fmla z15.h, z7.h, z1.h[3]\n"
+ "fmla z19.h, z7.h, z2.h[3]\n"
+ "fmla z23.h, z7.h, z3.h[3]\n"
+ "fmla z27.h, z7.h, z4.h[3]\n"
+ "fmla z31.h, z7.h, z5.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[4]\n"
+ "fmla z12.h, z6.h, z1.h[4]\n"
+ "fmla z16.h, z6.h, z2.h[4]\n"
+ "fmla z20.h, z6.h, z3.h[4]\n"
+ "fmla z24.h, z6.h, z4.h[4]\n"
+ "fmla z28.h, z6.h, z5.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[4]\n"
+ "fmla z13.h, z7.h, z1.h[4]\n"
+ "fmla z17.h, z7.h, z2.h[4]\n"
+ "fmla z21.h, z7.h, z3.h[4]\n"
+ "fmla z25.h, z7.h, z4.h[4]\n"
+ "fmla z29.h, z7.h, z5.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[4]\n"
+ "fmla z14.h, z6.h, z1.h[4]\n"
+ "fmla z18.h, z6.h, z2.h[4]\n"
+ "fmla z22.h, z6.h, z3.h[4]\n"
+ "fmla z26.h, z6.h, z4.h[4]\n"
+ "fmla z30.h, z6.h, z5.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[4]\n"
+ "fmla z15.h, z7.h, z1.h[4]\n"
+ "fmla z19.h, z7.h, z2.h[4]\n"
+ "fmla z23.h, z7.h, z3.h[4]\n"
+ "fmla z27.h, z7.h, z4.h[4]\n"
+ "fmla z31.h, z7.h, z5.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[5]\n"
+ "fmla z12.h, z6.h, z1.h[5]\n"
+ "fmla z16.h, z6.h, z2.h[5]\n"
+ "fmla z20.h, z6.h, z3.h[5]\n"
+ "fmla z24.h, z6.h, z4.h[5]\n"
+ "fmla z28.h, z6.h, z5.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[5]\n"
+ "fmla z13.h, z7.h, z1.h[5]\n"
+ "fmla z17.h, z7.h, z2.h[5]\n"
+ "fmla z21.h, z7.h, z3.h[5]\n"
+ "fmla z25.h, z7.h, z4.h[5]\n"
+ "fmla z29.h, z7.h, z5.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.h, z6.h, z0.h[5]\n"
+ "fmla z14.h, z6.h, z1.h[5]\n"
+ "fmla z18.h, z6.h, z2.h[5]\n"
+ "fmla z22.h, z6.h, z3.h[5]\n"
+ "fmla z26.h, z6.h, z4.h[5]\n"
+ "fmla z30.h, z6.h, z5.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[5]\n"
+ "fmla z15.h, z7.h, z1.h[5]\n"
+ "fmla z19.h, z7.h, z2.h[5]\n"
+ "fmla z23.h, z7.h, z3.h[5]\n"
+ "fmla z27.h, z7.h, z4.h[5]\n"
+ "fmla z31.h, z7.h, z5.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[6]\n"
+ "fmla z12.h, z6.h, z1.h[6]\n"
+ "fmla z16.h, z6.h, z2.h[6]\n"
+ "fmla z20.h, z6.h, z3.h[6]\n"
+ "fmla z24.h, z6.h, z4.h[6]\n"
+ "fmla z28.h, z6.h, z5.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[6]\n"
+ "fmla z13.h, z7.h, z1.h[6]\n"
+ "fmla z17.h, z7.h, z2.h[6]\n"
+ "fmla z21.h, z7.h, z3.h[6]\n"
+ "fmla z25.h, z7.h, z4.h[6]\n"
+ "fmla z29.h, z7.h, z5.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[6]\n"
+ "fmla z14.h, z6.h, z1.h[6]\n"
+ "fmla z18.h, z6.h, z2.h[6]\n"
+ "fmla z22.h, z6.h, z3.h[6]\n"
+ "fmla z26.h, z6.h, z4.h[6]\n"
+ "fmla z30.h, z6.h, z5.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.h, z7.h, z0.h[6]\n"
+ "fmla z15.h, z7.h, z1.h[6]\n"
+ "fmla z19.h, z7.h, z2.h[6]\n"
+ "fmla z23.h, z7.h, z3.h[6]\n"
+ "fmla z27.h, z7.h, z4.h[6]\n"
+ "fmla z31.h, z7.h, z5.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.h, z6.h, z0.h[7]\n"
+ "fmla z12.h, z6.h, z1.h[7]\n"
+ "fmla z16.h, z6.h, z2.h[7]\n"
+ "fmla z20.h, z6.h, z3.h[7]\n"
+ "fmla z24.h, z6.h, z4.h[7]\n"
+ "fmla z28.h, z6.h, z5.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[7]\n"
+ "fmla z13.h, z7.h, z1.h[7]\n"
+ "fmla z17.h, z7.h, z2.h[7]\n"
+ "fmla z21.h, z7.h, z3.h[7]\n"
+ "fmla z25.h, z7.h, z4.h[7]\n"
+ "fmla z29.h, z7.h, z5.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.h, z6.h, z0.h[7]\n"
+ "fmla z14.h, z6.h, z1.h[7]\n"
+ "fmla z18.h, z6.h, z2.h[7]\n"
+ "fmla z22.h, z6.h, z3.h[7]\n"
+ "fmla z26.h, z6.h, z4.h[7]\n"
+ "fmla z30.h, z6.h, z5.h[7]\n"
+ "fmla z11.h, z7.h, z0.h[7]\n"
+ "fmla z15.h, z7.h, z1.h[7]\n"
+ "fmla z19.h, z7.h, z2.h[7]\n"
+ "fmla z23.h, z7.h, z3.h[7]\n"
+ "fmla z27.h, z7.h, z4.h[7]\n"
+ "fmla z31.h, z7.h, z5.h[7]\n"
+ "bgt 80b\n"
+ "81:" // Height 6: Multiply loop: Single iteration only
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "whilelt p0.h, XZR, x11\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "ld1rqh { z0.h }, p0/Z, [x10]\n"
+ "fmla z8.h, z6.h, z0.h[0]\n"
+ "ld1rqh { z1.h }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.h, z7.h, z0.h[0]\n"
+ "ld1rqh { z2.h }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.h, z6.h, z1.h[0]\n"
+ "ld1rqh { z3.h }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.h, z6.h, z2.h[0]\n"
+ "ld1rqh { z4.h }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.h, z7.h, z1.h[0]\n"
+ "ld1rqh { z5.h }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "fmla z20.h, z6.h, z3.h[0]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z17.h, z7.h, z2.h[0]\n"
+ "fmla z24.h, z6.h, z4.h[0]\n"
+ "fmla z28.h, z6.h, z5.h[0]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z21.h, z7.h, z3.h[0]\n"
+ "fmla z25.h, z7.h, z4.h[0]\n"
+ "fmla z29.h, z7.h, z5.h[0]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[0]\n"
+ "fmla z14.h, z6.h, z1.h[0]\n"
+ "fmla z18.h, z6.h, z2.h[0]\n"
+ "fmla z22.h, z6.h, z3.h[0]\n"
+ "fmla z26.h, z6.h, z4.h[0]\n"
+ "fmla z30.h, z6.h, z5.h[0]\n"
+ "fmla z11.h, z7.h, z0.h[0]\n"
+ "fmla z15.h, z7.h, z1.h[0]\n"
+ "fmla z19.h, z7.h, z2.h[0]\n"
+ "fmla z23.h, z7.h, z3.h[0]\n"
+ "fmla z27.h, z7.h, z4.h[0]\n"
+ "fmla z31.h, z7.h, z5.h[0]\n"
+ "ble 82f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[1]\n"
+ "fmla z16.h, z6.h, z2.h[1]\n"
+ "fmla z20.h, z6.h, z3.h[1]\n"
+ "fmla z24.h, z6.h, z4.h[1]\n"
+ "fmla z28.h, z6.h, z5.h[1]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[1]\n"
+ "fmla z13.h, z7.h, z1.h[1]\n"
+ "fmla z17.h, z7.h, z2.h[1]\n"
+ "fmla z21.h, z7.h, z3.h[1]\n"
+ "fmla z25.h, z7.h, z4.h[1]\n"
+ "fmla z29.h, z7.h, z5.h[1]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[1]\n"
+ "fmla z14.h, z6.h, z1.h[1]\n"
+ "fmla z18.h, z6.h, z2.h[1]\n"
+ "fmla z22.h, z6.h, z3.h[1]\n"
+ "fmla z26.h, z6.h, z4.h[1]\n"
+ "fmla z30.h, z6.h, z5.h[1]\n"
+ "fmla z11.h, z7.h, z0.h[1]\n"
+ "fmla z15.h, z7.h, z1.h[1]\n"
+ "fmla z19.h, z7.h, z2.h[1]\n"
+ "fmla z23.h, z7.h, z3.h[1]\n"
+ "fmla z27.h, z7.h, z4.h[1]\n"
+ "fmla z31.h, z7.h, z5.h[1]\n"
+ "ble 82f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[2]\n"
+ "fmla z16.h, z6.h, z2.h[2]\n"
+ "fmla z20.h, z6.h, z3.h[2]\n"
+ "fmla z24.h, z6.h, z4.h[2]\n"
+ "fmla z28.h, z6.h, z5.h[2]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[2]\n"
+ "fmla z13.h, z7.h, z1.h[2]\n"
+ "fmla z17.h, z7.h, z2.h[2]\n"
+ "fmla z21.h, z7.h, z3.h[2]\n"
+ "fmla z25.h, z7.h, z4.h[2]\n"
+ "fmla z29.h, z7.h, z5.h[2]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[2]\n"
+ "fmla z14.h, z6.h, z1.h[2]\n"
+ "fmla z18.h, z6.h, z2.h[2]\n"
+ "fmla z22.h, z6.h, z3.h[2]\n"
+ "fmla z26.h, z6.h, z4.h[2]\n"
+ "fmla z30.h, z6.h, z5.h[2]\n"
+ "fmla z11.h, z7.h, z0.h[2]\n"
+ "fmla z15.h, z7.h, z1.h[2]\n"
+ "fmla z19.h, z7.h, z2.h[2]\n"
+ "fmla z23.h, z7.h, z3.h[2]\n"
+ "fmla z27.h, z7.h, z4.h[2]\n"
+ "fmla z31.h, z7.h, z5.h[2]\n"
+ "ble 82f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[3]\n"
+ "fmla z16.h, z6.h, z2.h[3]\n"
+ "fmla z20.h, z6.h, z3.h[3]\n"
+ "fmla z24.h, z6.h, z4.h[3]\n"
+ "fmla z28.h, z6.h, z5.h[3]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[3]\n"
+ "fmla z13.h, z7.h, z1.h[3]\n"
+ "fmla z17.h, z7.h, z2.h[3]\n"
+ "fmla z21.h, z7.h, z3.h[3]\n"
+ "fmla z25.h, z7.h, z4.h[3]\n"
+ "fmla z29.h, z7.h, z5.h[3]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[3]\n"
+ "fmla z14.h, z6.h, z1.h[3]\n"
+ "fmla z18.h, z6.h, z2.h[3]\n"
+ "fmla z22.h, z6.h, z3.h[3]\n"
+ "fmla z26.h, z6.h, z4.h[3]\n"
+ "fmla z30.h, z6.h, z5.h[3]\n"
+ "fmla z11.h, z7.h, z0.h[3]\n"
+ "fmla z15.h, z7.h, z1.h[3]\n"
+ "fmla z19.h, z7.h, z2.h[3]\n"
+ "fmla z23.h, z7.h, z3.h[3]\n"
+ "fmla z27.h, z7.h, z4.h[3]\n"
+ "fmla z31.h, z7.h, z5.h[3]\n"
+ "ble 82f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[4]\n"
+ "fmla z16.h, z6.h, z2.h[4]\n"
+ "fmla z20.h, z6.h, z3.h[4]\n"
+ "fmla z24.h, z6.h, z4.h[4]\n"
+ "fmla z28.h, z6.h, z5.h[4]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[4]\n"
+ "fmla z13.h, z7.h, z1.h[4]\n"
+ "fmla z17.h, z7.h, z2.h[4]\n"
+ "fmla z21.h, z7.h, z3.h[4]\n"
+ "fmla z25.h, z7.h, z4.h[4]\n"
+ "fmla z29.h, z7.h, z5.h[4]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[4]\n"
+ "fmla z14.h, z6.h, z1.h[4]\n"
+ "fmla z18.h, z6.h, z2.h[4]\n"
+ "fmla z22.h, z6.h, z3.h[4]\n"
+ "fmla z26.h, z6.h, z4.h[4]\n"
+ "fmla z30.h, z6.h, z5.h[4]\n"
+ "fmla z11.h, z7.h, z0.h[4]\n"
+ "fmla z15.h, z7.h, z1.h[4]\n"
+ "fmla z19.h, z7.h, z2.h[4]\n"
+ "fmla z23.h, z7.h, z3.h[4]\n"
+ "fmla z27.h, z7.h, z4.h[4]\n"
+ "fmla z31.h, z7.h, z5.h[4]\n"
+ "ble 82f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[5]\n"
+ "fmla z16.h, z6.h, z2.h[5]\n"
+ "fmla z20.h, z6.h, z3.h[5]\n"
+ "fmla z24.h, z6.h, z4.h[5]\n"
+ "fmla z28.h, z6.h, z5.h[5]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[5]\n"
+ "fmla z13.h, z7.h, z1.h[5]\n"
+ "fmla z17.h, z7.h, z2.h[5]\n"
+ "fmla z21.h, z7.h, z3.h[5]\n"
+ "fmla z25.h, z7.h, z4.h[5]\n"
+ "fmla z29.h, z7.h, z5.h[5]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[5]\n"
+ "fmla z14.h, z6.h, z1.h[5]\n"
+ "fmla z18.h, z6.h, z2.h[5]\n"
+ "fmla z22.h, z6.h, z3.h[5]\n"
+ "fmla z26.h, z6.h, z4.h[5]\n"
+ "fmla z30.h, z6.h, z5.h[5]\n"
+ "fmla z11.h, z7.h, z0.h[5]\n"
+ "fmla z15.h, z7.h, z1.h[5]\n"
+ "fmla z19.h, z7.h, z2.h[5]\n"
+ "fmla z23.h, z7.h, z3.h[5]\n"
+ "fmla z27.h, z7.h, z4.h[5]\n"
+ "fmla z31.h, z7.h, z5.h[5]\n"
+ "ble 82f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.h, z6.h, z1.h[6]\n"
+ "fmla z16.h, z6.h, z2.h[6]\n"
+ "fmla z20.h, z6.h, z3.h[6]\n"
+ "fmla z24.h, z6.h, z4.h[6]\n"
+ "fmla z28.h, z6.h, z5.h[6]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[6]\n"
+ "fmla z13.h, z7.h, z1.h[6]\n"
+ "fmla z17.h, z7.h, z2.h[6]\n"
+ "fmla z21.h, z7.h, z3.h[6]\n"
+ "fmla z25.h, z7.h, z4.h[6]\n"
+ "fmla z29.h, z7.h, z5.h[6]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[6]\n"
+ "fmla z14.h, z6.h, z1.h[6]\n"
+ "fmla z18.h, z6.h, z2.h[6]\n"
+ "fmla z22.h, z6.h, z3.h[6]\n"
+ "fmla z26.h, z6.h, z4.h[6]\n"
+ "fmla z30.h, z6.h, z5.h[6]\n"
+ "fmla z11.h, z7.h, z0.h[6]\n"
+ "fmla z15.h, z7.h, z1.h[6]\n"
+ "fmla z19.h, z7.h, z2.h[6]\n"
+ "fmla z23.h, z7.h, z3.h[6]\n"
+ "fmla z27.h, z7.h, z4.h[6]\n"
+ "fmla z31.h, z7.h, z5.h[6]\n"
+ "ble 82f\n"
+ "ld1h { z6.h }, p5/Z, [x15]\n"
+ "fmla z8.h, z6.h, z0.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z12.h, z6.h, z1.h[7]\n"
+ "fmla z16.h, z6.h, z2.h[7]\n"
+ "fmla z20.h, z6.h, z3.h[7]\n"
+ "fmla z24.h, z6.h, z4.h[7]\n"
+ "fmla z28.h, z6.h, z5.h[7]\n"
+ "ld1h { z6.h }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.h, z7.h, z0.h[7]\n"
+ "fmla z13.h, z7.h, z1.h[7]\n"
+ "fmla z17.h, z7.h, z2.h[7]\n"
+ "fmla z21.h, z7.h, z3.h[7]\n"
+ "fmla z25.h, z7.h, z4.h[7]\n"
+ "fmla z29.h, z7.h, z5.h[7]\n"
+ "ld1h { z7.h }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.h, z6.h, z0.h[7]\n"
+ "fmla z14.h, z6.h, z1.h[7]\n"
+ "fmla z18.h, z6.h, z2.h[7]\n"
+ "fmla z22.h, z6.h, z3.h[7]\n"
+ "fmla z26.h, z6.h, z4.h[7]\n"
+ "fmla z30.h, z6.h, z5.h[7]\n"
+ "fmla z11.h, z7.h, z0.h[7]\n"
+ "fmla z15.h, z7.h, z1.h[7]\n"
+ "fmla z19.h, z7.h, z2.h[7]\n"
+ "fmla z23.h, z7.h, z3.h[7]\n"
+ "fmla z27.h, z7.h, z4.h[7]\n"
+ "fmla z31.h, z7.h, z5.h[7]\n"
+ "82:" // Height 6: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 77b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 83f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rh { z1.h }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rh { z0.h }, p5/Z, [x19]\n"
+ "fmin z8.h, p5/M, z8.h, z0.h\n"
+ "fmin z9.h, p5/M, z9.h, z0.h\n"
+ "fmin z10.h, p5/M, z10.h, z0.h\n"
+ "fmin z11.h, p5/M, z11.h, z0.h\n"
+ "fmin z12.h, p5/M, z12.h, z0.h\n"
+ "fmax z8.h, p5/M, z8.h, z1.h\n"
+ "fmax z9.h, p5/M, z9.h, z1.h\n"
+ "fmax z10.h, p5/M, z10.h, z1.h\n"
+ "fmax z11.h, p5/M, z11.h, z1.h\n"
+ "fmax z12.h, p5/M, z12.h, z1.h\n"
+ "fmin z13.h, p5/M, z13.h, z0.h\n"
+ "fmin z14.h, p5/M, z14.h, z0.h\n"
+ "fmin z15.h, p5/M, z15.h, z0.h\n"
+ "fmin z16.h, p5/M, z16.h, z0.h\n"
+ "fmax z13.h, p5/M, z13.h, z1.h\n"
+ "fmax z14.h, p5/M, z14.h, z1.h\n"
+ "fmax z15.h, p5/M, z15.h, z1.h\n"
+ "fmax z16.h, p5/M, z16.h, z1.h\n"
+ "fmin z17.h, p5/M, z17.h, z0.h\n"
+ "fmin z18.h, p5/M, z18.h, z0.h\n"
+ "fmin z19.h, p5/M, z19.h, z0.h\n"
+ "fmin z20.h, p5/M, z20.h, z0.h\n"
+ "fmax z17.h, p5/M, z17.h, z1.h\n"
+ "fmax z18.h, p5/M, z18.h, z1.h\n"
+ "fmax z19.h, p5/M, z19.h, z1.h\n"
+ "fmax z20.h, p5/M, z20.h, z1.h\n"
+ "fmin z21.h, p5/M, z21.h, z0.h\n"
+ "fmin z22.h, p5/M, z22.h, z0.h\n"
+ "fmin z23.h, p5/M, z23.h, z0.h\n"
+ "fmin z24.h, p5/M, z24.h, z0.h\n"
+ "fmax z21.h, p5/M, z21.h, z1.h\n"
+ "fmax z22.h, p5/M, z22.h, z1.h\n"
+ "fmax z23.h, p5/M, z23.h, z1.h\n"
+ "fmax z24.h, p5/M, z24.h, z1.h\n"
+ "fmin z25.h, p5/M, z25.h, z0.h\n"
+ "fmin z26.h, p5/M, z26.h, z0.h\n"
+ "fmin z27.h, p5/M, z27.h, z0.h\n"
+ "fmin z28.h, p5/M, z28.h, z0.h\n"
+ "fmax z25.h, p5/M, z25.h, z1.h\n"
+ "fmax z26.h, p5/M, z26.h, z1.h\n"
+ "fmax z27.h, p5/M, z27.h, z1.h\n"
+ "fmax z28.h, p5/M, z28.h, z1.h\n"
+ "fmin z29.h, p5/M, z29.h, z0.h\n"
+ "fmin z30.h, p5/M, z30.h, z0.h\n"
+ "fmin z31.h, p5/M, z31.h, z0.h\n"
+ "fmax z29.h, p5/M, z29.h, z1.h\n"
+ "fmax z30.h, p5/M, z30.h, z1.h\n"
+ "fmax z31.h, p5/M, z31.h, z1.h\n"
+ "83:" // Height 6: No activation
+ "st1h { z8.h }, p4, [x13]\n"
+ "st1h { z9.h }, p3, [x13, #1, MUL VL]\n"
+ "st1h { z10.h }, p2, [x13, #2, MUL VL]\n"
+ "st1h { z11.h }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1h { z12.h }, p4, [x9]\n"
+ "st1h { z13.h }, p3, [x9, #1, MUL VL]\n"
+ "st1h { z14.h }, p2, [x9, #2, MUL VL]\n"
+ "st1h { z15.h }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1h { z16.h }, p4, [x27]\n"
+ "st1h { z17.h }, p3, [x27, #1, MUL VL]\n"
+ "st1h { z18.h }, p2, [x27, #2, MUL VL]\n"
+ "st1h { z19.h }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1h { z20.h }, p4, [x25]\n"
+ "st1h { z21.h }, p3, [x25, #1, MUL VL]\n"
+ "st1h { z22.h }, p2, [x25, #2, MUL VL]\n"
+ "st1h { z23.h }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "st1h { z24.h }, p4, [x23]\n"
+ "st1h { z25.h }, p3, [x23, #1, MUL VL]\n"
+ "st1h { z26.h }, p2, [x23, #2, MUL VL]\n"
+ "st1h { z27.h }, p1, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "st1h { z28.h }, p4, [x21]\n"
+ "st1h { z29.h }, p3, [x21, #1, MUL VL]\n"
+ "st1h { z30.h }, p2, [x21, #2, MUL VL]\n"
+ "st1h { z31.h }, p1, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #4\n"
+ "84:" // Height 6: Writeback done
+ "mov x19, #0x0\n"
+ "inch x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 73b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 86f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 85f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "85:" // Update direct input
+ "mov x19, #0xc\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "86:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp
deleted file mode 100644
index ce3624340e..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_4VLx4/generic.cpp
+++ /dev/null
@@ -1,2118 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_fp32_mla_4VLx4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
- const int K_stride = K;
- const long loops_count = ((K + 4) / 8) - 1;
- K -= loops_count * 8;
- const long regs_count = (K / 4) - 1;
- K -= (regs_count + 1) * 4;
- const long leftovers = K;
- float nullbias[256];
- if (!accumulate && !bias) {
- memset(nullbias, 0, (4 * get_vector_length<float>() * sizeof(float)));
- }
- float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
- float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
- const float * const minptr = &minval;
- const float * const maxptr = &maxval;
-
- switch(act.type)
- {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- maxval = static_cast<float>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- minval = 0.0f;
- break;
- }
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const float * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(float);
-
- float *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=(4 * get_vector_length<float>())) {
- const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<float>()));
- long loops = loops_count;
- long regs = regs_count;
- long temp = 0;
- long blocks = leftovers;
- const float *a_ptr0 = a_ptr0_base;
- const float *b_ptr0 = B + (K_stride * x0);
- const unsigned long ldcb = ldc * sizeof(float);
- const float *biasptr = bias ? bias+x0 : nullbias;
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "whilelt p6.s, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.s\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z16.s, p0/z, [%[biasptr]]\n"
- "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z16.s, z12.s, z0.s[3]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla z17.s, z13.s, z0.s[3]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[3]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[3]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[0]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.s, z9.s, z4.s[0]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[0]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[1]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
- "fmla z17.s, z13.s, z4.s[1]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z18.s, z14.s, z4.s[1]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z19.s, z15.s, z4.s[1]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[2]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z9.s, z4.s[2]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[2]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[2]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[3]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z17.s, z13.s, z4.s[3]\n"
- "fmla z18.s, z14.s, z4.s[3]\n"
- "fmla z19.s, z15.s, z4.s[3]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z16.s, z12.s, z0.s[3]\n"
- "fmla z17.s, z13.s, z0.s[3]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[3]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[3]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[0]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.s, z9.s, z4.s[0]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[0]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[1]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z17.s, z13.s, z4.s[1]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z14.s, z4.s[1]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z15.s, z4.s[1]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[2]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z9.s, z4.s[2]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[2]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[2]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z17.s, z13.s, z4.s[3]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z18.s, z14.s, z4.s[3]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "fmla z19.s, z15.s, z4.s[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "b 5f\n"
- "4:\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "fmla z16.s, z12.s, z0.s[3]\n"
- "fmla z17.s, z13.s, z0.s[3]\n"
- "fmla z18.s, z14.s, z0.s[3]\n"
- "fmla z19.s, z15.s, z0.s[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z17.s, z9.s, z4.s[0]\n"
- "fmla z18.s, z10.s, z4.s[0]\n"
- "fmla z19.s, z11.s, z4.s[0]\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z17.s, z13.s, z4.s[1]\n"
- "fmla z18.s, z14.s, z4.s[1]\n"
- "fmla z19.s, z15.s, z4.s[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[2]\n"
- "fmla z17.s, z9.s, z4.s[2]\n"
- "fmla z18.s, z10.s, z4.s[2]\n"
- "fmla z19.s, z11.s, z4.s[2]\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "whilelt p6.s, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.s\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z16.s, p0/z, [%[biasptr]]\n"
- "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "mov z21.d, z17.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z22.d, z18.d\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "mov z23.d, z19.d\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.s, z8.s, z1.s[0]\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- "fmla z21.s, z9.s, z1.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z22.s, z10.s, z1.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla z23.s, z11.s, z1.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla z20.s, z12.s, z1.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla z21.s, z13.s, z1.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z22.s, z14.s, z1.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "fmla z23.s, z15.s, z1.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.s, z8.s, z1.s[2]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "fmla z21.s, z9.s, z1.s[2]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z22.s, z10.s, z1.s[2]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z23.s, z11.s, z1.s[2]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[3]\n"
- "fmla z20.s, z12.s, z1.s[3]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[3]\n"
- "fmla z21.s, z13.s, z1.s[3]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[3]\n"
- "fmla z22.s, z14.s, z1.s[3]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
- "fmla z23.s, z15.s, z1.s[3]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[0]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
- "fmla z20.s, z8.s, z5.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z17.s, z9.s, z4.s[0]\n"
- "fmla z21.s, z9.s, z5.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[0]\n"
- "fmla z22.s, z10.s, z5.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[0]\n"
- "fmla z23.s, z11.s, z5.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[1]\n"
- "fmla z20.s, z12.s, z5.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z4.s[1]\n"
- "fmla z21.s, z13.s, z5.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z4.s[1]\n"
- "fmla z22.s, z14.s, z5.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z4.s[1]\n"
- "fmla z23.s, z15.s, z5.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.s, z8.s, z5.s[2]\n"
- "fmla z17.s, z9.s, z4.s[2]\n"
- "fmla z21.s, z9.s, z5.s[2]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[2]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z22.s, z10.s, z5.s[2]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[2]\n"
- "fmla z23.s, z11.s, z5.s[2]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[3]\n"
- "fmla z20.s, z12.s, z5.s[3]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.s, z13.s, z4.s[3]\n"
- "fmla z21.s, z13.s, z5.s[3]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.s, z14.s, z4.s[3]\n"
- "fmla z22.s, z14.s, z5.s[3]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.s, z15.s, z4.s[3]\n"
- "fmla z23.s, z15.s, z5.s[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.s, z8.s, z1.s[0]\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- "fmla z21.s, z9.s, z1.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z22.s, z10.s, z1.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "fmla z23.s, z11.s, z1.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "fmla z20.s, z12.s, z1.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "fmla z21.s, z13.s, z1.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z22.s, z14.s, z1.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "fmla z23.s, z15.s, z1.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.s, z8.s, z1.s[2]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "fmla z21.s, z9.s, z1.s[2]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z22.s, z10.s, z1.s[2]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z23.s, z11.s, z1.s[2]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[3]\n"
- "fmla z20.s, z12.s, z1.s[3]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[3]\n"
- "fmla z21.s, z13.s, z1.s[3]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[3]\n"
- "fmla z22.s, z14.s, z1.s[3]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z23.s, z15.s, z1.s[3]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[0]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- "fmla z20.s, z8.s, z5.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z17.s, z9.s, z4.s[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "fmla z21.s, z9.s, z5.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "fmla z22.s, z10.s, z5.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[0]\n"
- "fmla z23.s, z11.s, z5.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[1]\n"
- "fmla z20.s, z12.s, z5.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z4.s[1]\n"
- "fmla z21.s, z13.s, z5.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z4.s[1]\n"
- "fmla z22.s, z14.s, z5.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z4.s[1]\n"
- "fmla z23.s, z15.s, z5.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z20.s, z8.s, z5.s[2]\n"
- "fmla z17.s, z9.s, z4.s[2]\n"
- "fmla z21.s, z9.s, z5.s[2]\n"
- "fmla z18.s, z10.s, z4.s[2]\n"
- "fmla z22.s, z10.s, z5.s[2]\n"
- "fmla z19.s, z11.s, z4.s[2]\n"
- "fmla z23.s, z11.s, z5.s[2]\n"
- "fmla z16.s, z12.s, z4.s[3]\n"
- "fmla z20.s, z12.s, z5.s[3]\n"
- "fmla z17.s, z13.s, z4.s[3]\n"
- "fmla z21.s, z13.s, z5.s[3]\n"
- "fmla z18.s, z14.s, z4.s[3]\n"
- "fmla z22.s, z14.s, z5.s[3]\n"
- "fmla z19.s, z15.s, z4.s[3]\n"
- "fmla z23.s, z15.s, z5.s[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.s, z8.s, z1.s[0]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "fmla z21.s, z9.s, z1.s[0]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "fmla z22.s, z10.s, z1.s[0]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "fmla z23.s, z11.s, z1.s[0]\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.s, z12.s, z1.s[1]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "fmla z21.s, z13.s, z1.s[1]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z22.s, z14.s, z1.s[1]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "fmla z23.s, z15.s, z1.s[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "fmla z20.s, z8.s, z1.s[2]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "fmla z21.s, z9.s, z1.s[2]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "fmla z22.s, z10.s, z1.s[2]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z23.s, z11.s, z1.s[2]\n"
- "b 5f\n"
- "4:\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.s, z8.s, z1.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
- "fmla z21.s, z9.s, z1.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "ld1rqw z5.s, p6/z, [a_ptr1]\n"
- "fmla z22.s, z10.s, z1.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "fmla z23.s, z11.s, z1.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "fmla z20.s, z12.s, z1.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "fmla z21.s, z13.s, z1.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z22.s, z14.s, z1.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "fmla z23.s, z15.s, z1.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z20.s, z8.s, z1.s[2]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "fmla z21.s, z9.s, z1.s[2]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "fmla z22.s, z10.s, z1.s[2]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z23.s, z11.s, z1.s[2]\n"
- "fmla z16.s, z12.s, z0.s[3]\n"
- "fmla z20.s, z12.s, z1.s[3]\n"
- "fmla z17.s, z13.s, z0.s[3]\n"
- "fmla z21.s, z13.s, z1.s[3]\n"
- "fmla z18.s, z14.s, z0.s[3]\n"
- "fmla z22.s, z14.s, z1.s[3]\n"
- "fmla z19.s, z15.s, z0.s[3]\n"
- "fmla z23.s, z15.s, z1.s[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.s, z8.s, z5.s[0]\n"
- "fmla z17.s, z9.s, z4.s[0]\n"
- "fmla z21.s, z9.s, z5.s[0]\n"
- "fmla z18.s, z10.s, z4.s[0]\n"
- "fmla z22.s, z10.s, z5.s[0]\n"
- "fmla z19.s, z11.s, z4.s[0]\n"
- "fmla z23.s, z11.s, z5.s[0]\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.s, z12.s, z5.s[1]\n"
- "fmla z17.s, z13.s, z4.s[1]\n"
- "fmla z21.s, z13.s, z5.s[1]\n"
- "fmla z18.s, z14.s, z4.s[1]\n"
- "fmla z22.s, z14.s, z5.s[1]\n"
- "fmla z19.s, z15.s, z4.s[1]\n"
- "fmla z23.s, z15.s, z5.s[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[2]\n"
- "fmla z20.s, z8.s, z5.s[2]\n"
- "fmla z17.s, z9.s, z4.s[2]\n"
- "fmla z21.s, z9.s, z5.s[2]\n"
- "fmla z18.s, z10.s, z4.s[2]\n"
- "fmla z22.s, z10.s, z5.s[2]\n"
- "fmla z19.s, z11.s, z4.s[2]\n"
- "fmla z23.s, z11.s, z5.s[2]\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "st1w z20.s, p0, [c_ptr1]\n"
- "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
- "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
- "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "whilelt p6.s, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.s\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z16.s, p0/z, [%[biasptr]]\n"
- "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "mov z21.d, z17.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z22.d, z18.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z23.d, z19.d\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "mov z24.d, z16.d\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z25.d, z17.d\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z26.d, z18.d\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z27.d, z19.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1w z24.s, p0/z, [c_ptr2]\n"
- "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
- "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.s, z8.s, z1.s[0]\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "fmla z24.s, z8.s, z2.s[0]\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- "fmla z21.s, z9.s, z1.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z25.s, z9.s, z2.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla z22.s, z10.s, z1.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla z26.s, z10.s, z2.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla z23.s, z11.s, z1.s[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla z27.s, z11.s, z2.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "fmla z20.s, z12.s, z1.s[1]\n"
- "fmla z24.s, z12.s, z2.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "fmla z21.s, z13.s, z1.s[1]\n"
- "fmla z25.s, z13.s, z2.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z22.s, z14.s, z1.s[1]\n"
- "fmla z26.s, z14.s, z2.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "fmla z23.s, z15.s, z1.s[1]\n"
- "fmla z27.s, z15.s, z2.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.s, z8.s, z1.s[2]\n"
- "fmla z24.s, z8.s, z2.s[2]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z21.s, z9.s, z1.s[2]\n"
- "fmla z25.s, z9.s, z2.s[2]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "fmla z22.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z23.s, z11.s, z1.s[2]\n"
- "fmla z27.s, z11.s, z2.s[2]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[3]\n"
- "fmla z20.s, z12.s, z1.s[3]\n"
- "fmla z24.s, z12.s, z2.s[3]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[3]\n"
- "fmla z21.s, z13.s, z1.s[3]\n"
- "fmla z25.s, z13.s, z2.s[3]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[3]\n"
- "fmla z22.s, z14.s, z1.s[3]\n"
- "fmla z26.s, z14.s, z2.s[3]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
- "fmla z23.s, z15.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
- "fmla z27.s, z15.s, z2.s[3]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[0]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
- "fmla z20.s, z8.s, z5.s[0]\n"
- "fmla z24.s, z8.s, z6.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z17.s, z9.s, z4.s[0]\n"
- "fmla z21.s, z9.s, z5.s[0]\n"
- "fmla z25.s, z9.s, z6.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[0]\n"
- "fmla z22.s, z10.s, z5.s[0]\n"
- "fmla z26.s, z10.s, z6.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[0]\n"
- "fmla z23.s, z11.s, z5.s[0]\n"
- "fmla z27.s, z11.s, z6.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[1]\n"
- "fmla z20.s, z12.s, z5.s[1]\n"
- "fmla z24.s, z12.s, z6.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z4.s[1]\n"
- "fmla z21.s, z13.s, z5.s[1]\n"
- "fmla z25.s, z13.s, z6.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z4.s[1]\n"
- "fmla z22.s, z14.s, z5.s[1]\n"
- "fmla z26.s, z14.s, z6.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z4.s[1]\n"
- "fmla z23.s, z15.s, z5.s[1]\n"
- "fmla z27.s, z15.s, z6.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.s, z8.s, z5.s[2]\n"
- "fmla z24.s, z8.s, z6.s[2]\n"
- "fmla z17.s, z9.s, z4.s[2]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z21.s, z9.s, z5.s[2]\n"
- "fmla z25.s, z9.s, z6.s[2]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[2]\n"
- "fmla z22.s, z10.s, z5.s[2]\n"
- "fmla z26.s, z10.s, z6.s[2]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[2]\n"
- "fmla z23.s, z11.s, z5.s[2]\n"
- "fmla z27.s, z11.s, z6.s[2]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[3]\n"
- "fmla z20.s, z12.s, z5.s[3]\n"
- "fmla z24.s, z12.s, z6.s[3]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.s, z13.s, z4.s[3]\n"
- "fmla z21.s, z13.s, z5.s[3]\n"
- "fmla z25.s, z13.s, z6.s[3]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.s, z14.s, z4.s[3]\n"
- "fmla z22.s, z14.s, z5.s[3]\n"
- "fmla z26.s, z14.s, z6.s[3]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.s, z15.s, z4.s[3]\n"
- "fmla z23.s, z15.s, z5.s[3]\n"
- "fmla z27.s, z15.s, z6.s[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.s, z8.s, z1.s[0]\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "fmla z24.s, z8.s, z2.s[0]\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- "fmla z21.s, z9.s, z1.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z25.s, z9.s, z2.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "fmla z22.s, z10.s, z1.s[0]\n"
- "fmla z26.s, z10.s, z2.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "fmla z23.s, z11.s, z1.s[0]\n"
- "fmla z27.s, z11.s, z2.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "fmla z20.s, z12.s, z1.s[1]\n"
- "fmla z24.s, z12.s, z2.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "fmla z21.s, z13.s, z1.s[1]\n"
- "fmla z25.s, z13.s, z2.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z22.s, z14.s, z1.s[1]\n"
- "fmla z26.s, z14.s, z2.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "fmla z23.s, z15.s, z1.s[1]\n"
- "fmla z27.s, z15.s, z2.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.s, z8.s, z1.s[2]\n"
- "fmla z24.s, z8.s, z2.s[2]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z21.s, z9.s, z1.s[2]\n"
- "fmla z25.s, z9.s, z2.s[2]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "fmla z22.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z23.s, z11.s, z1.s[2]\n"
- "fmla z27.s, z11.s, z2.s[2]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[3]\n"
- "fmla z20.s, z12.s, z1.s[3]\n"
- "fmla z24.s, z12.s, z2.s[3]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[3]\n"
- "fmla z21.s, z13.s, z1.s[3]\n"
- "fmla z25.s, z13.s, z2.s[3]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[3]\n"
- "fmla z22.s, z14.s, z1.s[3]\n"
- "fmla z26.s, z14.s, z2.s[3]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z23.s, z15.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- "fmla z27.s, z15.s, z2.s[3]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[0]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- "fmla z20.s, z8.s, z5.s[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "fmla z24.s, z8.s, z6.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z17.s, z9.s, z4.s[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "fmla z21.s, z9.s, z5.s[0]\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- "fmla z25.s, z9.s, z6.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[0]\n"
- "fmla z22.s, z10.s, z5.s[0]\n"
- "fmla z26.s, z10.s, z6.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[0]\n"
- "fmla z23.s, z11.s, z5.s[0]\n"
- "fmla z27.s, z11.s, z6.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[1]\n"
- "fmla z20.s, z12.s, z5.s[1]\n"
- "fmla z24.s, z12.s, z6.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z4.s[1]\n"
- "fmla z21.s, z13.s, z5.s[1]\n"
- "fmla z25.s, z13.s, z6.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z4.s[1]\n"
- "fmla z22.s, z14.s, z5.s[1]\n"
- "fmla z26.s, z14.s, z6.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z4.s[1]\n"
- "fmla z23.s, z15.s, z5.s[1]\n"
- "fmla z27.s, z15.s, z6.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z20.s, z8.s, z5.s[2]\n"
- "fmla z24.s, z8.s, z6.s[2]\n"
- "fmla z17.s, z9.s, z4.s[2]\n"
- "fmla z21.s, z9.s, z5.s[2]\n"
- "fmla z25.s, z9.s, z6.s[2]\n"
- "fmla z18.s, z10.s, z4.s[2]\n"
- "fmla z22.s, z10.s, z5.s[2]\n"
- "fmla z26.s, z10.s, z6.s[2]\n"
- "fmla z19.s, z11.s, z4.s[2]\n"
- "fmla z23.s, z11.s, z5.s[2]\n"
- "fmla z27.s, z11.s, z6.s[2]\n"
- "fmla z16.s, z12.s, z4.s[3]\n"
- "fmla z20.s, z12.s, z5.s[3]\n"
- "fmla z24.s, z12.s, z6.s[3]\n"
- "fmla z17.s, z13.s, z4.s[3]\n"
- "fmla z21.s, z13.s, z5.s[3]\n"
- "fmla z25.s, z13.s, z6.s[3]\n"
- "fmla z18.s, z14.s, z4.s[3]\n"
- "fmla z22.s, z14.s, z5.s[3]\n"
- "fmla z26.s, z14.s, z6.s[3]\n"
- "fmla z19.s, z15.s, z4.s[3]\n"
- "fmla z23.s, z15.s, z5.s[3]\n"
- "fmla z27.s, z15.s, z6.s[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.s, z8.s, z1.s[0]\n"
- "fmla z24.s, z8.s, z2.s[0]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "fmla z21.s, z9.s, z1.s[0]\n"
- "fmla z25.s, z9.s, z2.s[0]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "fmla z22.s, z10.s, z1.s[0]\n"
- "fmla z26.s, z10.s, z2.s[0]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "fmla z23.s, z11.s, z1.s[0]\n"
- "fmla z27.s, z11.s, z2.s[0]\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.s, z12.s, z1.s[1]\n"
- "fmla z24.s, z12.s, z2.s[1]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "fmla z21.s, z13.s, z1.s[1]\n"
- "fmla z25.s, z13.s, z2.s[1]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z22.s, z14.s, z1.s[1]\n"
- "fmla z26.s, z14.s, z2.s[1]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "fmla z23.s, z15.s, z1.s[1]\n"
- "fmla z27.s, z15.s, z2.s[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "fmla z20.s, z8.s, z1.s[2]\n"
- "fmla z24.s, z8.s, z2.s[2]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "fmla z21.s, z9.s, z1.s[2]\n"
- "fmla z25.s, z9.s, z2.s[2]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "fmla z22.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z23.s, z11.s, z1.s[2]\n"
- "fmla z27.s, z11.s, z2.s[2]\n"
- "b 5f\n"
- "4:\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.s, z8.s, z1.s[0]\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
- "fmla z24.s, z8.s, z2.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "ld1rqw z5.s, p6/z, [a_ptr1]\n"
- "fmla z21.s, z9.s, z1.s[0]\n"
- "ld1rqw z6.s, p6/z, [a_ptr2]\n"
- "fmla z25.s, z9.s, z2.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "fmla z22.s, z10.s, z1.s[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "fmla z26.s, z10.s, z2.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- "fmla z23.s, z11.s, z1.s[0]\n"
- "fmla z27.s, z11.s, z2.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "fmla z20.s, z12.s, z1.s[1]\n"
- "fmla z24.s, z12.s, z2.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "fmla z21.s, z13.s, z1.s[1]\n"
- "fmla z25.s, z13.s, z2.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z22.s, z14.s, z1.s[1]\n"
- "fmla z26.s, z14.s, z2.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "fmla z23.s, z15.s, z1.s[1]\n"
- "fmla z27.s, z15.s, z2.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z20.s, z8.s, z1.s[2]\n"
- "fmla z24.s, z8.s, z2.s[2]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "fmla z21.s, z9.s, z1.s[2]\n"
- "fmla z25.s, z9.s, z2.s[2]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "fmla z22.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z23.s, z11.s, z1.s[2]\n"
- "fmla z27.s, z11.s, z2.s[2]\n"
- "fmla z16.s, z12.s, z0.s[3]\n"
- "fmla z20.s, z12.s, z1.s[3]\n"
- "fmla z24.s, z12.s, z2.s[3]\n"
- "fmla z17.s, z13.s, z0.s[3]\n"
- "fmla z21.s, z13.s, z1.s[3]\n"
- "fmla z25.s, z13.s, z2.s[3]\n"
- "fmla z18.s, z14.s, z0.s[3]\n"
- "fmla z22.s, z14.s, z1.s[3]\n"
- "fmla z26.s, z14.s, z2.s[3]\n"
- "fmla z19.s, z15.s, z0.s[3]\n"
- "fmla z23.s, z15.s, z1.s[3]\n"
- "fmla z27.s, z15.s, z2.s[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.s, z8.s, z5.s[0]\n"
- "fmla z24.s, z8.s, z6.s[0]\n"
- "fmla z17.s, z9.s, z4.s[0]\n"
- "fmla z21.s, z9.s, z5.s[0]\n"
- "fmla z25.s, z9.s, z6.s[0]\n"
- "fmla z18.s, z10.s, z4.s[0]\n"
- "fmla z22.s, z10.s, z5.s[0]\n"
- "fmla z26.s, z10.s, z6.s[0]\n"
- "fmla z19.s, z11.s, z4.s[0]\n"
- "fmla z23.s, z11.s, z5.s[0]\n"
- "fmla z27.s, z11.s, z6.s[0]\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.s, z12.s, z5.s[1]\n"
- "fmla z24.s, z12.s, z6.s[1]\n"
- "fmla z17.s, z13.s, z4.s[1]\n"
- "fmla z21.s, z13.s, z5.s[1]\n"
- "fmla z25.s, z13.s, z6.s[1]\n"
- "fmla z18.s, z14.s, z4.s[1]\n"
- "fmla z22.s, z14.s, z5.s[1]\n"
- "fmla z26.s, z14.s, z6.s[1]\n"
- "fmla z19.s, z15.s, z4.s[1]\n"
- "fmla z23.s, z15.s, z5.s[1]\n"
- "fmla z27.s, z15.s, z6.s[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[2]\n"
- "fmla z20.s, z8.s, z5.s[2]\n"
- "fmla z24.s, z8.s, z6.s[2]\n"
- "fmla z17.s, z9.s, z4.s[2]\n"
- "fmla z21.s, z9.s, z5.s[2]\n"
- "fmla z25.s, z9.s, z6.s[2]\n"
- "fmla z18.s, z10.s, z4.s[2]\n"
- "fmla z22.s, z10.s, z5.s[2]\n"
- "fmla z26.s, z10.s, z6.s[2]\n"
- "fmla z19.s, z11.s, z4.s[2]\n"
- "fmla z23.s, z11.s, z5.s[2]\n"
- "fmla z27.s, z11.s, z6.s[2]\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "st1w z20.s, p0, [c_ptr1]\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
- "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
- "st1w z24.s, p0, [c_ptr2]\n"
- "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
- "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
- "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "whilelt p6.s, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.s\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z16.s, p0/z, [%[biasptr]]\n"
- "ld1w z17.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[biasptr], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[biasptr], #3, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "mov z21.d, z17.d\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "mov z22.d, z18.d\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "mov z23.d, z19.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "mov z24.d, z16.d\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "mov z25.d, z17.d\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z26.d, z18.d\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z27.d, z19.d\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z28.d, z16.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z29.d, z17.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z30.d, z18.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "mov z31.d, z19.d\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1w z24.s, p0/z, [c_ptr2]\n"
- "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
- "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
- "ld1w z28.s, p0/z, [c_ptr3]\n"
- "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
- "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.s, z8.s, z1.s[0]\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "fmla z24.s, z8.s, z2.s[0]\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- "fmla z28.s, z8.s, z3.s[0]\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr3]\n"
- "fmla z21.s, z9.s, z1.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z25.s, z9.s, z2.s[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "fmla z29.s, z9.s, z3.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "fmla z22.s, z10.s, z1.s[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "fmla z26.s, z10.s, z2.s[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "fmla z30.s, z10.s, z3.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- "fmla z23.s, z11.s, z1.s[0]\n"
- "fmla z27.s, z11.s, z2.s[0]\n"
- "fmla z31.s, z11.s, z3.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "fmla z20.s, z12.s, z1.s[1]\n"
- "fmla z24.s, z12.s, z2.s[1]\n"
- "fmla z28.s, z12.s, z3.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "fmla z21.s, z13.s, z1.s[1]\n"
- "fmla z25.s, z13.s, z2.s[1]\n"
- "fmla z29.s, z13.s, z3.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z22.s, z14.s, z1.s[1]\n"
- "fmla z26.s, z14.s, z2.s[1]\n"
- "fmla z30.s, z14.s, z3.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "fmla z23.s, z15.s, z1.s[1]\n"
- "fmla z27.s, z15.s, z2.s[1]\n"
- "fmla z31.s, z15.s, z3.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.s, z8.s, z1.s[2]\n"
- "fmla z24.s, z8.s, z2.s[2]\n"
- "fmla z28.s, z8.s, z3.s[2]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "fmla z21.s, z9.s, z1.s[2]\n"
- "fmla z25.s, z9.s, z2.s[2]\n"
- "fmla z29.s, z9.s, z3.s[2]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "fmla z22.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z30.s, z10.s, z3.s[2]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z23.s, z11.s, z1.s[2]\n"
- "fmla z27.s, z11.s, z2.s[2]\n"
- "fmla z31.s, z11.s, z3.s[2]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[3]\n"
- "fmla z20.s, z12.s, z1.s[3]\n"
- "fmla z24.s, z12.s, z2.s[3]\n"
- "fmla z28.s, z12.s, z3.s[3]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[3]\n"
- "fmla z21.s, z13.s, z1.s[3]\n"
- "fmla z25.s, z13.s, z2.s[3]\n"
- "fmla z29.s, z13.s, z3.s[3]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[3]\n"
- "fmla z22.s, z14.s, z1.s[3]\n"
- "fmla z26.s, z14.s, z2.s[3]\n"
- "fmla z30.s, z14.s, z3.s[3]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[3]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
- "fmla z23.s, z15.s, z1.s[3]\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
- "fmla z27.s, z15.s, z2.s[3]\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
- "fmla z31.s, z15.s, z3.s[3]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[0]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
- "fmla z20.s, z8.s, z5.s[0]\n"
- "fmla z24.s, z8.s, z6.s[0]\n"
- "fmla z28.s, z8.s, z7.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z17.s, z9.s, z4.s[0]\n"
- "fmla z21.s, z9.s, z5.s[0]\n"
- "fmla z25.s, z9.s, z6.s[0]\n"
- "fmla z29.s, z9.s, z7.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[0]\n"
- "fmla z22.s, z10.s, z5.s[0]\n"
- "fmla z26.s, z10.s, z6.s[0]\n"
- "fmla z30.s, z10.s, z7.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[0]\n"
- "fmla z23.s, z11.s, z5.s[0]\n"
- "fmla z27.s, z11.s, z6.s[0]\n"
- "fmla z31.s, z11.s, z7.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[1]\n"
- "fmla z20.s, z12.s, z5.s[1]\n"
- "fmla z24.s, z12.s, z6.s[1]\n"
- "fmla z28.s, z12.s, z7.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z4.s[1]\n"
- "fmla z21.s, z13.s, z5.s[1]\n"
- "fmla z25.s, z13.s, z6.s[1]\n"
- "fmla z29.s, z13.s, z7.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z4.s[1]\n"
- "fmla z22.s, z14.s, z5.s[1]\n"
- "fmla z26.s, z14.s, z6.s[1]\n"
- "fmla z30.s, z14.s, z7.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z4.s[1]\n"
- "fmla z23.s, z15.s, z5.s[1]\n"
- "fmla z27.s, z15.s, z6.s[1]\n"
- "fmla z31.s, z15.s, z7.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.s, z8.s, z5.s[2]\n"
- "fmla z24.s, z8.s, z6.s[2]\n"
- "fmla z28.s, z8.s, z7.s[2]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z17.s, z9.s, z4.s[2]\n"
- "fmla z21.s, z9.s, z5.s[2]\n"
- "fmla z25.s, z9.s, z6.s[2]\n"
- "fmla z29.s, z9.s, z7.s[2]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[2]\n"
- "fmla z22.s, z10.s, z5.s[2]\n"
- "fmla z26.s, z10.s, z6.s[2]\n"
- "fmla z30.s, z10.s, z7.s[2]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[2]\n"
- "fmla z23.s, z11.s, z5.s[2]\n"
- "fmla z27.s, z11.s, z6.s[2]\n"
- "fmla z31.s, z11.s, z7.s[2]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[3]\n"
- "fmla z20.s, z12.s, z5.s[3]\n"
- "fmla z24.s, z12.s, z6.s[3]\n"
- "fmla z28.s, z12.s, z7.s[3]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.s, z13.s, z4.s[3]\n"
- "fmla z21.s, z13.s, z5.s[3]\n"
- "fmla z25.s, z13.s, z6.s[3]\n"
- "fmla z29.s, z13.s, z7.s[3]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.s, z14.s, z4.s[3]\n"
- "fmla z22.s, z14.s, z5.s[3]\n"
- "fmla z26.s, z14.s, z6.s[3]\n"
- "fmla z30.s, z14.s, z7.s[3]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.s, z15.s, z4.s[3]\n"
- "fmla z23.s, z15.s, z5.s[3]\n"
- "fmla z27.s, z15.s, z6.s[3]\n"
- "fmla z31.s, z15.s, z7.s[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.s, z8.s, z1.s[0]\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "fmla z24.s, z8.s, z2.s[0]\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- "fmla z28.s, z8.s, z3.s[0]\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "ld1rqw z7.s, p7/z, [a_ptr3]\n"
- "fmla z21.s, z9.s, z1.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z25.s, z9.s, z2.s[0]\n"
- "fmla z29.s, z9.s, z3.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "fmla z22.s, z10.s, z1.s[0]\n"
- "fmla z26.s, z10.s, z2.s[0]\n"
- "fmla z30.s, z10.s, z3.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "fmla z23.s, z11.s, z1.s[0]\n"
- "fmla z27.s, z11.s, z2.s[0]\n"
- "fmla z31.s, z11.s, z3.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "fmla z20.s, z12.s, z1.s[1]\n"
- "fmla z24.s, z12.s, z2.s[1]\n"
- "fmla z28.s, z12.s, z3.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "fmla z21.s, z13.s, z1.s[1]\n"
- "fmla z25.s, z13.s, z2.s[1]\n"
- "fmla z29.s, z13.s, z3.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z22.s, z14.s, z1.s[1]\n"
- "fmla z26.s, z14.s, z2.s[1]\n"
- "fmla z30.s, z14.s, z3.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "fmla z23.s, z15.s, z1.s[1]\n"
- "fmla z27.s, z15.s, z2.s[1]\n"
- "fmla z31.s, z15.s, z3.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "fmla z20.s, z8.s, z1.s[2]\n"
- "fmla z24.s, z8.s, z2.s[2]\n"
- "fmla z28.s, z8.s, z3.s[2]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "fmla z21.s, z9.s, z1.s[2]\n"
- "fmla z25.s, z9.s, z2.s[2]\n"
- "fmla z29.s, z9.s, z3.s[2]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "fmla z22.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z30.s, z10.s, z3.s[2]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z23.s, z11.s, z1.s[2]\n"
- "fmla z27.s, z11.s, z2.s[2]\n"
- "fmla z31.s, z11.s, z3.s[2]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[3]\n"
- "fmla z20.s, z12.s, z1.s[3]\n"
- "fmla z24.s, z12.s, z2.s[3]\n"
- "fmla z28.s, z12.s, z3.s[3]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[3]\n"
- "fmla z21.s, z13.s, z1.s[3]\n"
- "fmla z25.s, z13.s, z2.s[3]\n"
- "fmla z29.s, z13.s, z3.s[3]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[3]\n"
- "fmla z22.s, z14.s, z1.s[3]\n"
- "fmla z26.s, z14.s, z2.s[3]\n"
- "fmla z30.s, z14.s, z3.s[3]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[3]\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- "fmla z23.s, z15.s, z1.s[3]\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- "fmla z27.s, z15.s, z2.s[3]\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- "fmla z31.s, z15.s, z3.s[3]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[0]\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- "fmla z20.s, z8.s, z5.s[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "fmla z24.s, z8.s, z6.s[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "fmla z28.s, z8.s, z7.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z17.s, z9.s, z4.s[0]\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- "fmla z21.s, z9.s, z5.s[0]\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- "fmla z25.s, z9.s, z6.s[0]\n"
- "fmla z29.s, z9.s, z7.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z4.s[0]\n"
- "fmla z22.s, z10.s, z5.s[0]\n"
- "fmla z26.s, z10.s, z6.s[0]\n"
- "fmla z30.s, z10.s, z7.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z4.s[0]\n"
- "fmla z23.s, z11.s, z5.s[0]\n"
- "fmla z27.s, z11.s, z6.s[0]\n"
- "fmla z31.s, z11.s, z7.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[1]\n"
- "fmla z20.s, z12.s, z5.s[1]\n"
- "fmla z24.s, z12.s, z6.s[1]\n"
- "fmla z28.s, z12.s, z7.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z4.s[1]\n"
- "fmla z21.s, z13.s, z5.s[1]\n"
- "fmla z25.s, z13.s, z6.s[1]\n"
- "fmla z29.s, z13.s, z7.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z4.s[1]\n"
- "fmla z22.s, z14.s, z5.s[1]\n"
- "fmla z26.s, z14.s, z6.s[1]\n"
- "fmla z30.s, z14.s, z7.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z4.s[1]\n"
- "fmla z23.s, z15.s, z5.s[1]\n"
- "fmla z27.s, z15.s, z6.s[1]\n"
- "fmla z31.s, z15.s, z7.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z20.s, z8.s, z5.s[2]\n"
- "fmla z24.s, z8.s, z6.s[2]\n"
- "fmla z28.s, z8.s, z7.s[2]\n"
- "fmla z17.s, z9.s, z4.s[2]\n"
- "fmla z21.s, z9.s, z5.s[2]\n"
- "fmla z25.s, z9.s, z6.s[2]\n"
- "fmla z29.s, z9.s, z7.s[2]\n"
- "fmla z18.s, z10.s, z4.s[2]\n"
- "fmla z22.s, z10.s, z5.s[2]\n"
- "fmla z26.s, z10.s, z6.s[2]\n"
- "fmla z30.s, z10.s, z7.s[2]\n"
- "fmla z19.s, z11.s, z4.s[2]\n"
- "fmla z23.s, z11.s, z5.s[2]\n"
- "fmla z27.s, z11.s, z6.s[2]\n"
- "fmla z31.s, z11.s, z7.s[2]\n"
- "fmla z16.s, z12.s, z4.s[3]\n"
- "fmla z20.s, z12.s, z5.s[3]\n"
- "fmla z24.s, z12.s, z6.s[3]\n"
- "fmla z28.s, z12.s, z7.s[3]\n"
- "fmla z17.s, z13.s, z4.s[3]\n"
- "fmla z21.s, z13.s, z5.s[3]\n"
- "fmla z25.s, z13.s, z6.s[3]\n"
- "fmla z29.s, z13.s, z7.s[3]\n"
- "fmla z18.s, z14.s, z4.s[3]\n"
- "fmla z22.s, z14.s, z5.s[3]\n"
- "fmla z26.s, z14.s, z6.s[3]\n"
- "fmla z30.s, z14.s, z7.s[3]\n"
- "fmla z19.s, z15.s, z4.s[3]\n"
- "fmla z23.s, z15.s, z5.s[3]\n"
- "fmla z27.s, z15.s, z6.s[3]\n"
- "fmla z31.s, z15.s, z7.s[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.s, z8.s, z1.s[0]\n"
- "fmla z24.s, z8.s, z2.s[0]\n"
- "fmla z28.s, z8.s, z3.s[0]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "fmla z21.s, z9.s, z1.s[0]\n"
- "fmla z25.s, z9.s, z2.s[0]\n"
- "fmla z29.s, z9.s, z3.s[0]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "fmla z22.s, z10.s, z1.s[0]\n"
- "fmla z26.s, z10.s, z2.s[0]\n"
- "fmla z30.s, z10.s, z3.s[0]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "fmla z23.s, z11.s, z1.s[0]\n"
- "fmla z27.s, z11.s, z2.s[0]\n"
- "fmla z31.s, z11.s, z3.s[0]\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.s, z12.s, z1.s[1]\n"
- "fmla z24.s, z12.s, z2.s[1]\n"
- "fmla z28.s, z12.s, z3.s[1]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "fmla z21.s, z13.s, z1.s[1]\n"
- "fmla z25.s, z13.s, z2.s[1]\n"
- "fmla z29.s, z13.s, z3.s[1]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z22.s, z14.s, z1.s[1]\n"
- "fmla z26.s, z14.s, z2.s[1]\n"
- "fmla z30.s, z14.s, z3.s[1]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "fmla z23.s, z15.s, z1.s[1]\n"
- "fmla z27.s, z15.s, z2.s[1]\n"
- "fmla z31.s, z15.s, z3.s[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "fmla z20.s, z8.s, z1.s[2]\n"
- "fmla z24.s, z8.s, z2.s[2]\n"
- "fmla z28.s, z8.s, z3.s[2]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "fmla z21.s, z9.s, z1.s[2]\n"
- "fmla z25.s, z9.s, z2.s[2]\n"
- "fmla z29.s, z9.s, z3.s[2]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "fmla z22.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z30.s, z10.s, z3.s[2]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z23.s, z11.s, z1.s[2]\n"
- "fmla z27.s, z11.s, z2.s[2]\n"
- "fmla z31.s, z11.s, z3.s[2]\n"
- "b 5f\n"
- "4:\n"
- "fmla z16.s, z8.s, z0.s[0]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "fmla z20.s, z8.s, z1.s[0]\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
- "fmla z24.s, z8.s, z2.s[0]\n"
- "ld1rqw z5.s, p6/z, [a_ptr1]\n"
- "fmla z28.s, z8.s, z3.s[0]\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "fmla z17.s, z9.s, z0.s[0]\n"
- "ld1rqw z6.s, p6/z, [a_ptr2]\n"
- "fmla z21.s, z9.s, z1.s[0]\n"
- "ld1rqw z7.s, p6/z, [a_ptr3]\n"
- "fmla z25.s, z9.s, z2.s[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "fmla z29.s, z9.s, z3.s[0]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "fmla z18.s, z10.s, z0.s[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "fmla z22.s, z10.s, z1.s[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- "fmla z26.s, z10.s, z2.s[0]\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- "fmla z30.s, z10.s, z3.s[0]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z19.s, z11.s, z0.s[0]\n"
- "fmla z23.s, z11.s, z1.s[0]\n"
- "fmla z27.s, z11.s, z2.s[0]\n"
- "fmla z31.s, z11.s, z3.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z16.s, z12.s, z0.s[1]\n"
- "fmla z20.s, z12.s, z1.s[1]\n"
- "fmla z24.s, z12.s, z2.s[1]\n"
- "fmla z28.s, z12.s, z3.s[1]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "fmla z17.s, z13.s, z0.s[1]\n"
- "fmla z21.s, z13.s, z1.s[1]\n"
- "fmla z25.s, z13.s, z2.s[1]\n"
- "fmla z29.s, z13.s, z3.s[1]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "fmla z18.s, z14.s, z0.s[1]\n"
- "fmla z22.s, z14.s, z1.s[1]\n"
- "fmla z26.s, z14.s, z2.s[1]\n"
- "fmla z30.s, z14.s, z3.s[1]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z19.s, z15.s, z0.s[1]\n"
- "fmla z23.s, z15.s, z1.s[1]\n"
- "fmla z27.s, z15.s, z2.s[1]\n"
- "fmla z31.s, z15.s, z3.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z16.s, z8.s, z0.s[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "fmla z20.s, z8.s, z1.s[2]\n"
- "fmla z24.s, z8.s, z2.s[2]\n"
- "fmla z28.s, z8.s, z3.s[2]\n"
- "fmla z17.s, z9.s, z0.s[2]\n"
- "fmla z21.s, z9.s, z1.s[2]\n"
- "fmla z25.s, z9.s, z2.s[2]\n"
- "fmla z29.s, z9.s, z3.s[2]\n"
- "fmla z18.s, z10.s, z0.s[2]\n"
- "fmla z22.s, z10.s, z1.s[2]\n"
- "fmla z26.s, z10.s, z2.s[2]\n"
- "fmla z30.s, z10.s, z3.s[2]\n"
- "fmla z19.s, z11.s, z0.s[2]\n"
- "fmla z23.s, z11.s, z1.s[2]\n"
- "fmla z27.s, z11.s, z2.s[2]\n"
- "fmla z31.s, z11.s, z3.s[2]\n"
- "fmla z16.s, z12.s, z0.s[3]\n"
- "fmla z20.s, z12.s, z1.s[3]\n"
- "fmla z24.s, z12.s, z2.s[3]\n"
- "fmla z28.s, z12.s, z3.s[3]\n"
- "fmla z17.s, z13.s, z0.s[3]\n"
- "fmla z21.s, z13.s, z1.s[3]\n"
- "fmla z25.s, z13.s, z2.s[3]\n"
- "fmla z29.s, z13.s, z3.s[3]\n"
- "fmla z18.s, z14.s, z0.s[3]\n"
- "fmla z22.s, z14.s, z1.s[3]\n"
- "fmla z26.s, z14.s, z2.s[3]\n"
- "fmla z30.s, z14.s, z3.s[3]\n"
- "fmla z19.s, z15.s, z0.s[3]\n"
- "fmla z23.s, z15.s, z1.s[3]\n"
- "fmla z27.s, z15.s, z2.s[3]\n"
- "fmla z31.s, z15.s, z3.s[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1w z8.s, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[0]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "fmla z20.s, z8.s, z5.s[0]\n"
- "fmla z24.s, z8.s, z6.s[0]\n"
- "fmla z28.s, z8.s, z7.s[0]\n"
- "fmla z17.s, z9.s, z4.s[0]\n"
- "fmla z21.s, z9.s, z5.s[0]\n"
- "fmla z25.s, z9.s, z6.s[0]\n"
- "fmla z29.s, z9.s, z7.s[0]\n"
- "fmla z18.s, z10.s, z4.s[0]\n"
- "fmla z22.s, z10.s, z5.s[0]\n"
- "fmla z26.s, z10.s, z6.s[0]\n"
- "fmla z30.s, z10.s, z7.s[0]\n"
- "fmla z19.s, z11.s, z4.s[0]\n"
- "fmla z23.s, z11.s, z5.s[0]\n"
- "fmla z27.s, z11.s, z6.s[0]\n"
- "fmla z31.s, z11.s, z7.s[0]\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "fmla z16.s, z12.s, z4.s[1]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "fmla z20.s, z12.s, z5.s[1]\n"
- "fmla z24.s, z12.s, z6.s[1]\n"
- "fmla z28.s, z12.s, z7.s[1]\n"
- "fmla z17.s, z13.s, z4.s[1]\n"
- "fmla z21.s, z13.s, z5.s[1]\n"
- "fmla z25.s, z13.s, z6.s[1]\n"
- "fmla z29.s, z13.s, z7.s[1]\n"
- "fmla z18.s, z14.s, z4.s[1]\n"
- "fmla z22.s, z14.s, z5.s[1]\n"
- "fmla z26.s, z14.s, z6.s[1]\n"
- "fmla z30.s, z14.s, z7.s[1]\n"
- "fmla z19.s, z15.s, z4.s[1]\n"
- "fmla z23.s, z15.s, z5.s[1]\n"
- "fmla z27.s, z15.s, z6.s[1]\n"
- "fmla z31.s, z15.s, z7.s[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "ld1w z8.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1w z9.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1w z10.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z11.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "fmla z16.s, z8.s, z4.s[2]\n"
- "fmla z20.s, z8.s, z5.s[2]\n"
- "fmla z24.s, z8.s, z6.s[2]\n"
- "fmla z28.s, z8.s, z7.s[2]\n"
- "fmla z17.s, z9.s, z4.s[2]\n"
- "fmla z21.s, z9.s, z5.s[2]\n"
- "fmla z25.s, z9.s, z6.s[2]\n"
- "fmla z29.s, z9.s, z7.s[2]\n"
- "fmla z18.s, z10.s, z4.s[2]\n"
- "fmla z22.s, z10.s, z5.s[2]\n"
- "fmla z26.s, z10.s, z6.s[2]\n"
- "fmla z30.s, z10.s, z7.s[2]\n"
- "fmla z19.s, z11.s, z4.s[2]\n"
- "fmla z23.s, z11.s, z5.s[2]\n"
- "fmla z27.s, z11.s, z6.s[2]\n"
- "fmla z31.s, z11.s, z7.s[2]\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "st1w z20.s, p0, [c_ptr1]\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "fmax z28.s, p7/m, z28.s, z14.s\n"
- "fmax z29.s, p7/m, z29.s, z14.s\n"
- "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "fmax z30.s, p7/m, z30.s, z14.s\n"
- "fmin z28.s, p7/m, z28.s, z15.s\n"
- "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
- "fmin z29.s, p7/m, z29.s, z15.s\n"
- "fmax z31.s, p7/m, z31.s, z14.s\n"
- "fmin z30.s, p7/m, z30.s, z15.s\n"
- "st1w z24.s, p0, [c_ptr2]\n"
- "fmin z31.s, p7/m, z31.s, z15.s\n"
- "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
- "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
- "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
- "st1w z28.s, p0, [c_ptr3]\n"
- "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
- "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
- "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- }
-
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp
new file mode 100644
index 0000000000..f0cc70b76e
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __ARM_FEATURE_SVE
+
+#include "../std_transforms_sve.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<float>, \
+ size_t, size_t, \
+ const float *, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
+
+namespace arm_gemm
+{
+
+// Actual kernel implementation (ARGLIST signature); defined in sve_hybrid_fp32_mla_6x4VL/generic.cpp
+void sve_hybrid_fp32_mla_6x4VL( ARGLIST );
+
+class cls_sve_hybrid_fp32_mla_6x4VL // Descriptor for the sve_hybrid_fp32_mla_6x4VL kernel: element types, blocking parameters, transforms and entry point
+{
+public:
+ typedef float operand_type; // element type of the input pointers in ARGLIST (float)
+ typedef float result_type; // element type of the output argument in ARGLIST (float)
+
+ typedef void (*kern_type)( ARGLIST ); // pointer to a function taking the ARGLIST parameter list
+
+ /* Kernel blocking parameters (compile-time constants except out_width) */
+ static constexpr unsigned int out_height() // NOTE(review): presumably the number of output rows handled per kernel call - confirm against the callers
+ {
+ return 6;
+ }
+
+ static unsigned int out_width() // not constexpr, unlike the other parameters; NOTE(review): presumably because the SVE vector length is only known at run time - confirm
+ {
+ return get_vector_length<float>() * 4;
+ }
+
+ static constexpr unsigned int k_unroll() // NOTE(review): presumably the K-dimension unroll granularity expected by the framework - confirm
+ {
+ return 1;
+ }
+
+ static constexpr bool supports_accumulate() // always true; the kernel signature carries a trailing bool (named 'accumulate' in generic.cpp)
+ {
+ return true;
+ }
+
+ StdTransformsSVE<operand_type, result_type, 6, 4, 1> transforms = {}; // NOTE(review): 6, 4, 1 appear to repeat out_height, the factor in out_width and k_unroll - keep in sync; confirm parameter order in std_transforms_sve.hpp
+
+ // Default to the generic kernel (the only implementation declared in this header)
+ kern_type kernel=sve_hybrid_fp32_mla_6x4VL;
+
+ cls_sve_hybrid_fp32_mla_6x4VL(const CPUInfo *) // CPUInfo argument is unnamed and unused: 'kernel' keeps its default initializer
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp
new file mode 100644
index 0000000000..3a6422abd1
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_6x4VL/generic.cpp
@@ -0,0 +1,2236 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_fp32_mla_6x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs {
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const float *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+ "ptrue p5.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 71f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 57f\n"
+ "beq 43f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 29f\n"
+ "beq 15f\n"
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x16\n"
+ "cbz x14, 4f\n"
+ "ld1w { z8.s }, p5/Z, [x14]\n"
+ "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "b 6f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 5f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "b 6f\n"
+ "5:" // Height 1: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "6:" // Height 1: setup done
+ "mov x12, #0x0\n"
+ "7:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 8f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "cbnz x12, 9f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "b 9f\n"
+ "8:" // Height 1: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "9:" // Height 1: input setup done
+ "cmp x11, #0x4\n"
+ "ble 11f\n"
+ "10:" // Height 1: Multiply loop: Main loop head
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "whilelt p0.s, XZR, x11\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "fmla z8.s, z6.s, z0.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.s, z7.s, z0.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "cmp x11, #0x4\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla z11.s, z7.s, z0.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.s, z6.s, z0.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[3]\n"
+ "fmla z11.s, z7.s, z0.s[3]\n"
+ "bgt 10b\n"
+ "11:" // Height 1: Multiply loop: Single iteration only
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "whilelt p0.s, XZR, x11\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "fmla z8.s, z6.s, z0.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.s, z7.s, z0.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "fmla z11.s, z7.s, z0.s[0]\n"
+ "ble 12f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z9.s, z7.s, z0.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[1]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z11.s, z7.s, z0.s[1]\n"
+ "ble 12f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z9.s, z7.s, z0.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[2]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z11.s, z7.s, z0.s[2]\n"
+ "ble 12f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[3]\n"
+ "fmla z11.s, z7.s, z0.s[3]\n"
+ "12:" // Height 1: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 7b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "tbz %x[flags], #1, 13f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "13:" // Height 1: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "14:" // Height 1: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 3b\n"
+ "b 86f\n"
+ "15:" // Height 2
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 16f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "b 17f\n"
+ "16:" // Height 2: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "17:" // Height 2: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x16\n"
+ "cbz x14, 18f\n"
+ "ld1w { z8.s }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "mov z13.d, z9.d\n"
+ "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "b 20f\n"
+ "18:" // Height 2: no bias
+ "tbz %x[flags], #0, 19f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "b 20f\n"
+ "19:" // Height 2: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "20:" // Height 2: setup done
+ "mov x12, #0x0\n"
+ "21:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 22f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x12, 23f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "b 23f\n"
+ "22:" // Height 2: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #2\n"
+ "23:" // Height 2: input setup done
+ "cmp x11, #0x4\n"
+ "ble 25f\n"
+ "24:" // Height 2: Multiply loop: Main loop head
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "whilelt p0.s, XZR, x11\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "fmla z8.s, z6.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.s, z7.s, z0.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.s, z6.s, z1.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "cmp x11, #0x4\n"
+ "fmla z13.s, z7.s, z1.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "fmla z14.s, z6.s, z1.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[0]\n"
+ "fmla z15.s, z7.s, z1.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[1]\n"
+ "fmla z12.s, z6.s, z1.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[1]\n"
+ "fmla z13.s, z7.s, z1.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.s, z6.s, z0.s[1]\n"
+ "fmla z14.s, z6.s, z1.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[1]\n"
+ "fmla z15.s, z7.s, z1.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[2]\n"
+ "fmla z12.s, z6.s, z1.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[2]\n"
+ "fmla z13.s, z7.s, z1.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[2]\n"
+ "fmla z14.s, z6.s, z1.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[2]\n"
+ "fmla z15.s, z7.s, z1.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[3]\n"
+ "fmla z12.s, z6.s, z1.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[3]\n"
+ "fmla z13.s, z7.s, z1.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[3]\n"
+ "fmla z14.s, z6.s, z1.s[3]\n"
+ "fmla z11.s, z7.s, z0.s[3]\n"
+ "fmla z15.s, z7.s, z1.s[3]\n"
+ "bgt 24b\n"
+ "25:" // Height 2: Multiply loop: Single iteration only
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "whilelt p0.s, XZR, x11\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "fmla z8.s, z6.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.s, z7.s, z0.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.s, z6.s, z1.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z13.s, z7.s, z1.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "fmla z14.s, z6.s, z1.s[0]\n"
+ "fmla z11.s, z7.s, z0.s[0]\n"
+ "fmla z15.s, z7.s, z1.s[0]\n"
+ "ble 26f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.s, z6.s, z1.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[1]\n"
+ "fmla z13.s, z7.s, z1.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[1]\n"
+ "fmla z14.s, z6.s, z1.s[1]\n"
+ "fmla z11.s, z7.s, z0.s[1]\n"
+ "fmla z15.s, z7.s, z1.s[1]\n"
+ "ble 26f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.s, z6.s, z1.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[2]\n"
+ "fmla z13.s, z7.s, z1.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[2]\n"
+ "fmla z14.s, z6.s, z1.s[2]\n"
+ "fmla z11.s, z7.s, z0.s[2]\n"
+ "fmla z15.s, z7.s, z1.s[2]\n"
+ "ble 26f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z12.s, z6.s, z1.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[3]\n"
+ "fmla z13.s, z7.s, z1.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[3]\n"
+ "fmla z14.s, z6.s, z1.s[3]\n"
+ "fmla z11.s, z7.s, z0.s[3]\n"
+ "fmla z15.s, z7.s, z1.s[3]\n"
+ "26:" // Height 2: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 21b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "tbz %x[flags], #1, 27f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z1.s\n"
+ "fmax z14.s, p5/M, z14.s, z1.s\n"
+ "fmax z15.s, p5/M, z15.s, z1.s\n"
+ "27:" // Height 2: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "28:" // Height 2: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 17b\n"
+ "b 86f\n"
+ "29:" // Height 3
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 30f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "b 31f\n"
+ "30:" // Height 3: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "31:" // Height 3: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x16\n"
+ "cbz x14, 32f\n"
+ "ld1w { z8.s }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "mov z13.d, z9.d\n"
+ "addvl x14, x14, #4\n"
+ "mov z17.d, z9.d\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "b 34f\n"
+ "32:" // Height 3: no bias
+ "tbz %x[flags], #0, 33f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "b 34f\n"
+ "33:" // Height 3: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "34:" // Height 3: setup done
+ "mov x12, #0x0\n"
+ "35:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 36f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x12, 37f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "b 37f\n"
+ "36:" // Height 3: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "37:" // Height 3: input setup done
+ "cmp x11, #0x4\n"
+ "ble 39f\n"
+ "38:" // Height 3: Multiply loop: Main loop head
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "whilelt p0.s, XZR, x11\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "fmla z8.s, z6.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.s, z7.s, z0.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.s, z6.s, z1.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.s, z6.s, z2.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "cmp x11, #0x4\n"
+ "fmla z13.s, z7.s, z1.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla z17.s, z7.s, z2.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "fmla z14.s, z6.s, z1.s[0]\n"
+ "fmla z18.s, z6.s, z2.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[0]\n"
+ "fmla z15.s, z7.s, z1.s[0]\n"
+ "fmla z19.s, z7.s, z2.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[1]\n"
+ "fmla z12.s, z6.s, z1.s[1]\n"
+ "fmla z16.s, z6.s, z2.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[1]\n"
+ "fmla z13.s, z7.s, z1.s[1]\n"
+ "fmla z17.s, z7.s, z2.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.s, z6.s, z0.s[1]\n"
+ "fmla z14.s, z6.s, z1.s[1]\n"
+ "fmla z18.s, z6.s, z2.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[1]\n"
+ "fmla z15.s, z7.s, z1.s[1]\n"
+ "fmla z19.s, z7.s, z2.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[2]\n"
+ "fmla z12.s, z6.s, z1.s[2]\n"
+ "fmla z16.s, z6.s, z2.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[2]\n"
+ "fmla z13.s, z7.s, z1.s[2]\n"
+ "fmla z17.s, z7.s, z2.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[2]\n"
+ "fmla z14.s, z6.s, z1.s[2]\n"
+ "fmla z18.s, z6.s, z2.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[2]\n"
+ "fmla z15.s, z7.s, z1.s[2]\n"
+ "fmla z19.s, z7.s, z2.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[3]\n"
+ "fmla z12.s, z6.s, z1.s[3]\n"
+ "fmla z16.s, z6.s, z2.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[3]\n"
+ "fmla z13.s, z7.s, z1.s[3]\n"
+ "fmla z17.s, z7.s, z2.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[3]\n"
+ "fmla z14.s, z6.s, z1.s[3]\n"
+ "fmla z18.s, z6.s, z2.s[3]\n"
+ "fmla z11.s, z7.s, z0.s[3]\n"
+ "fmla z15.s, z7.s, z1.s[3]\n"
+ "fmla z19.s, z7.s, z2.s[3]\n"
+ "bgt 38b\n"
+ "39:" // Height 3: Multiply loop: Single iteration only
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "whilelt p0.s, XZR, x11\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "fmla z8.s, z6.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.s, z7.s, z0.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.s, z6.s, z1.s[0]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z13.s, z7.s, z1.s[0]\n"
+ "fmla z16.s, z6.s, z2.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z17.s, z7.s, z2.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "fmla z14.s, z6.s, z1.s[0]\n"
+ "fmla z18.s, z6.s, z2.s[0]\n"
+ "fmla z11.s, z7.s, z0.s[0]\n"
+ "fmla z15.s, z7.s, z1.s[0]\n"
+ "fmla z19.s, z7.s, z2.s[0]\n"
+ "ble 40f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.s, z6.s, z1.s[1]\n"
+ "fmla z16.s, z6.s, z2.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[1]\n"
+ "fmla z13.s, z7.s, z1.s[1]\n"
+ "fmla z17.s, z7.s, z2.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[1]\n"
+ "fmla z14.s, z6.s, z1.s[1]\n"
+ "fmla z18.s, z6.s, z2.s[1]\n"
+ "fmla z11.s, z7.s, z0.s[1]\n"
+ "fmla z15.s, z7.s, z1.s[1]\n"
+ "fmla z19.s, z7.s, z2.s[1]\n"
+ "ble 40f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.s, z6.s, z1.s[2]\n"
+ "fmla z16.s, z6.s, z2.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[2]\n"
+ "fmla z13.s, z7.s, z1.s[2]\n"
+ "fmla z17.s, z7.s, z2.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[2]\n"
+ "fmla z14.s, z6.s, z1.s[2]\n"
+ "fmla z18.s, z6.s, z2.s[2]\n"
+ "fmla z11.s, z7.s, z0.s[2]\n"
+ "fmla z15.s, z7.s, z1.s[2]\n"
+ "fmla z19.s, z7.s, z2.s[2]\n"
+ "ble 40f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z12.s, z6.s, z1.s[3]\n"
+ "fmla z16.s, z6.s, z2.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[3]\n"
+ "fmla z13.s, z7.s, z1.s[3]\n"
+ "fmla z17.s, z7.s, z2.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[3]\n"
+ "fmla z14.s, z6.s, z1.s[3]\n"
+ "fmla z18.s, z6.s, z2.s[3]\n"
+ "fmla z11.s, z7.s, z0.s[3]\n"
+ "fmla z15.s, z7.s, z1.s[3]\n"
+ "fmla z19.s, z7.s, z2.s[3]\n"
+ "40:" // Height 3: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 35b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "tbz %x[flags], #1, 41f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z1.s\n"
+ "fmax z14.s, p5/M, z14.s, z1.s\n"
+ "fmax z15.s, p5/M, z15.s, z1.s\n"
+ "fmax z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z1.s\n"
+ "fmax z18.s, p5/M, z18.s, z1.s\n"
+ "fmax z19.s, p5/M, z19.s, z1.s\n"
+ "41:" // Height 3: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "42:" // Height 3: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 31b\n"
+ "b 86f\n"
+ "43:" // Height 4
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 44f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 45f\n"
+ "44:" // Height 4: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "45:" // Height 4: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x16\n"
+ "cbz x14, 46f\n"
+ "ld1w { z8.s }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "mov z20.d, z8.d\n"
+ "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "mov z13.d, z9.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "b 48f\n"
+ "46:" // Height 4: no bias
+ "tbz %x[flags], #0, 47f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25]\n"
+ "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "b 48f\n"
+ "47:" // Height 4: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "48:" // Height 4: setup done
+ "mov x12, #0x0\n"
+ "49:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 50f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x12, 51f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "b 51f\n"
+ "50:" // Height 4: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "51:" // Height 4: input setup done
+ "cmp x11, #0x4\n"
+ "ble 53f\n"
+ "52:" // Height 4: Multiply loop: Main loop head
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "whilelt p0.s, XZR, x11\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "fmla z8.s, z6.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.s, z7.s, z0.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.s, z6.s, z1.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.s, z6.s, z2.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.s, z7.s, z1.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x4\n"
+ "fmla z20.s, z6.s, z3.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z17.s, z7.s, z2.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla z21.s, z7.s, z3.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "fmla z14.s, z6.s, z1.s[0]\n"
+ "fmla z18.s, z6.s, z2.s[0]\n"
+ "fmla z22.s, z6.s, z3.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[0]\n"
+ "fmla z15.s, z7.s, z1.s[0]\n"
+ "fmla z19.s, z7.s, z2.s[0]\n"
+ "fmla z23.s, z7.s, z3.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[1]\n"
+ "fmla z12.s, z6.s, z1.s[1]\n"
+ "fmla z16.s, z6.s, z2.s[1]\n"
+ "fmla z20.s, z6.s, z3.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[1]\n"
+ "fmla z13.s, z7.s, z1.s[1]\n"
+ "fmla z17.s, z7.s, z2.s[1]\n"
+ "fmla z21.s, z7.s, z3.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.s, z6.s, z0.s[1]\n"
+ "fmla z14.s, z6.s, z1.s[1]\n"
+ "fmla z18.s, z6.s, z2.s[1]\n"
+ "fmla z22.s, z6.s, z3.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[1]\n"
+ "fmla z15.s, z7.s, z1.s[1]\n"
+ "fmla z19.s, z7.s, z2.s[1]\n"
+ "fmla z23.s, z7.s, z3.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[2]\n"
+ "fmla z12.s, z6.s, z1.s[2]\n"
+ "fmla z16.s, z6.s, z2.s[2]\n"
+ "fmla z20.s, z6.s, z3.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[2]\n"
+ "fmla z13.s, z7.s, z1.s[2]\n"
+ "fmla z17.s, z7.s, z2.s[2]\n"
+ "fmla z21.s, z7.s, z3.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[2]\n"
+ "fmla z14.s, z6.s, z1.s[2]\n"
+ "fmla z18.s, z6.s, z2.s[2]\n"
+ "fmla z22.s, z6.s, z3.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[2]\n"
+ "fmla z15.s, z7.s, z1.s[2]\n"
+ "fmla z19.s, z7.s, z2.s[2]\n"
+ "fmla z23.s, z7.s, z3.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[3]\n"
+ "fmla z12.s, z6.s, z1.s[3]\n"
+ "fmla z16.s, z6.s, z2.s[3]\n"
+ "fmla z20.s, z6.s, z3.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[3]\n"
+ "fmla z13.s, z7.s, z1.s[3]\n"
+ "fmla z17.s, z7.s, z2.s[3]\n"
+ "fmla z21.s, z7.s, z3.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[3]\n"
+ "fmla z14.s, z6.s, z1.s[3]\n"
+ "fmla z18.s, z6.s, z2.s[3]\n"
+ "fmla z22.s, z6.s, z3.s[3]\n"
+ "fmla z11.s, z7.s, z0.s[3]\n"
+ "fmla z15.s, z7.s, z1.s[3]\n"
+ "fmla z19.s, z7.s, z2.s[3]\n"
+ "fmla z23.s, z7.s, z3.s[3]\n"
+ "bgt 52b\n"
+ "53:" // Height 4: Multiply loop: Single iteration only
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "whilelt p0.s, XZR, x11\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "fmla z8.s, z6.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.s, z7.s, z0.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.s, z6.s, z1.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.s, z6.s, z2.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.s, z7.s, z1.s[0]\n"
+ "fmla z17.s, z7.s, z2.s[0]\n"
+ "fmla z20.s, z6.s, z3.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z21.s, z7.s, z3.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "fmla z14.s, z6.s, z1.s[0]\n"
+ "fmla z18.s, z6.s, z2.s[0]\n"
+ "fmla z22.s, z6.s, z3.s[0]\n"
+ "fmla z11.s, z7.s, z0.s[0]\n"
+ "fmla z15.s, z7.s, z1.s[0]\n"
+ "fmla z19.s, z7.s, z2.s[0]\n"
+ "fmla z23.s, z7.s, z3.s[0]\n"
+ "ble 54f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.s, z6.s, z1.s[1]\n"
+ "fmla z16.s, z6.s, z2.s[1]\n"
+ "fmla z20.s, z6.s, z3.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[1]\n"
+ "fmla z13.s, z7.s, z1.s[1]\n"
+ "fmla z17.s, z7.s, z2.s[1]\n"
+ "fmla z21.s, z7.s, z3.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[1]\n"
+ "fmla z14.s, z6.s, z1.s[1]\n"
+ "fmla z18.s, z6.s, z2.s[1]\n"
+ "fmla z22.s, z6.s, z3.s[1]\n"
+ "fmla z11.s, z7.s, z0.s[1]\n"
+ "fmla z15.s, z7.s, z1.s[1]\n"
+ "fmla z19.s, z7.s, z2.s[1]\n"
+ "fmla z23.s, z7.s, z3.s[1]\n"
+ "ble 54f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.s, z6.s, z1.s[2]\n"
+ "fmla z16.s, z6.s, z2.s[2]\n"
+ "fmla z20.s, z6.s, z3.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[2]\n"
+ "fmla z13.s, z7.s, z1.s[2]\n"
+ "fmla z17.s, z7.s, z2.s[2]\n"
+ "fmla z21.s, z7.s, z3.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[2]\n"
+ "fmla z14.s, z6.s, z1.s[2]\n"
+ "fmla z18.s, z6.s, z2.s[2]\n"
+ "fmla z22.s, z6.s, z3.s[2]\n"
+ "fmla z11.s, z7.s, z0.s[2]\n"
+ "fmla z15.s, z7.s, z1.s[2]\n"
+ "fmla z19.s, z7.s, z2.s[2]\n"
+ "fmla z23.s, z7.s, z3.s[2]\n"
+ "ble 54f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z12.s, z6.s, z1.s[3]\n"
+ "fmla z16.s, z6.s, z2.s[3]\n"
+ "fmla z20.s, z6.s, z3.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[3]\n"
+ "fmla z13.s, z7.s, z1.s[3]\n"
+ "fmla z17.s, z7.s, z2.s[3]\n"
+ "fmla z21.s, z7.s, z3.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[3]\n"
+ "fmla z14.s, z6.s, z1.s[3]\n"
+ "fmla z18.s, z6.s, z2.s[3]\n"
+ "fmla z22.s, z6.s, z3.s[3]\n"
+ "fmla z11.s, z7.s, z0.s[3]\n"
+ "fmla z15.s, z7.s, z1.s[3]\n"
+ "fmla z19.s, z7.s, z2.s[3]\n"
+ "fmla z23.s, z7.s, z3.s[3]\n"
+ "54:" // Height 4: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 49b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbz %x[flags], #1, 55f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z1.s\n"
+ "fmax z14.s, p5/M, z14.s, z1.s\n"
+ "fmax z15.s, p5/M, z15.s, z1.s\n"
+ "fmax z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmin z20.s, p5/M, z20.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z1.s\n"
+ "fmax z18.s, p5/M, z18.s, z1.s\n"
+ "fmax z19.s, p5/M, z19.s, z1.s\n"
+ "fmax z20.s, p5/M, z20.s, z1.s\n"
+ "fmin z21.s, p5/M, z21.s, z0.s\n"
+ "fmin z22.s, p5/M, z22.s, z0.s\n"
+ "fmin z23.s, p5/M, z23.s, z0.s\n"
+ "fmax z21.s, p5/M, z21.s, z1.s\n"
+ "fmax z22.s, p5/M, z22.s, z1.s\n"
+ "fmax z23.s, p5/M, z23.s, z1.s\n"
+ "55:" // Height 4: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1w { z20.s }, p4, [x25]\n"
+ "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "56:" // Height 4: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 45b\n"
+ "b 86f\n"
+ "57:" // Height 5
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 58f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 59f\n"
+ "58:" // Height 5: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "59:" // Height 5: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x16\n"
+ "cbz x14, 60f\n"
+ "ld1w { z8.s }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "mov z20.d, z8.d\n"
+ "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "mov z13.d, z9.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "b 62f\n"
+ "60:" // Height 5: no bias
+ "tbz %x[flags], #0, 61f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25]\n"
+ "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x23]\n"
+ "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "b 62f\n"
+ "61:" // Height 5: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "62:" // Height 5: setup done
+ "mov x12, #0x0\n"
+ "63:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 64f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x12, 65f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "b 65f\n"
+ "64:" // Height 5: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "add x22, x24, x19, LSL #2\n"
+ "65:" // Height 5: input setup done
+ "cmp x11, #0x4\n"
+ "ble 67f\n"
+ "66:" // Height 5: Multiply loop: Main loop head
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "whilelt p0.s, XZR, x11\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "fmla z8.s, z6.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.s, z7.s, z0.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.s, z6.s, z1.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.s, z6.s, z2.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.s, z7.s, z1.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "fmla z20.s, z6.s, z3.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x4\n"
+ "fmla z24.s, z6.s, z4.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z17.s, z7.s, z2.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla z21.s, z7.s, z3.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla z25.s, z7.s, z4.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla z14.s, z6.s, z1.s[0]\n"
+ "fmla z18.s, z6.s, z2.s[0]\n"
+ "fmla z22.s, z6.s, z3.s[0]\n"
+ "fmla z26.s, z6.s, z4.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[0]\n"
+ "fmla z15.s, z7.s, z1.s[0]\n"
+ "fmla z19.s, z7.s, z2.s[0]\n"
+ "fmla z23.s, z7.s, z3.s[0]\n"
+ "fmla z27.s, z7.s, z4.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[1]\n"
+ "fmla z12.s, z6.s, z1.s[1]\n"
+ "fmla z16.s, z6.s, z2.s[1]\n"
+ "fmla z20.s, z6.s, z3.s[1]\n"
+ "fmla z24.s, z6.s, z4.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[1]\n"
+ "fmla z13.s, z7.s, z1.s[1]\n"
+ "fmla z17.s, z7.s, z2.s[1]\n"
+ "fmla z21.s, z7.s, z3.s[1]\n"
+ "fmla z25.s, z7.s, z4.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.s, z6.s, z0.s[1]\n"
+ "fmla z14.s, z6.s, z1.s[1]\n"
+ "fmla z18.s, z6.s, z2.s[1]\n"
+ "fmla z22.s, z6.s, z3.s[1]\n"
+ "fmla z26.s, z6.s, z4.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[1]\n"
+ "fmla z15.s, z7.s, z1.s[1]\n"
+ "fmla z19.s, z7.s, z2.s[1]\n"
+ "fmla z23.s, z7.s, z3.s[1]\n"
+ "fmla z27.s, z7.s, z4.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[2]\n"
+ "fmla z12.s, z6.s, z1.s[2]\n"
+ "fmla z16.s, z6.s, z2.s[2]\n"
+ "fmla z20.s, z6.s, z3.s[2]\n"
+ "fmla z24.s, z6.s, z4.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[2]\n"
+ "fmla z13.s, z7.s, z1.s[2]\n"
+ "fmla z17.s, z7.s, z2.s[2]\n"
+ "fmla z21.s, z7.s, z3.s[2]\n"
+ "fmla z25.s, z7.s, z4.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[2]\n"
+ "fmla z14.s, z6.s, z1.s[2]\n"
+ "fmla z18.s, z6.s, z2.s[2]\n"
+ "fmla z22.s, z6.s, z3.s[2]\n"
+ "fmla z26.s, z6.s, z4.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[2]\n"
+ "fmla z15.s, z7.s, z1.s[2]\n"
+ "fmla z19.s, z7.s, z2.s[2]\n"
+ "fmla z23.s, z7.s, z3.s[2]\n"
+ "fmla z27.s, z7.s, z4.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[3]\n"
+ "fmla z12.s, z6.s, z1.s[3]\n"
+ "fmla z16.s, z6.s, z2.s[3]\n"
+ "fmla z20.s, z6.s, z3.s[3]\n"
+ "fmla z24.s, z6.s, z4.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[3]\n"
+ "fmla z13.s, z7.s, z1.s[3]\n"
+ "fmla z17.s, z7.s, z2.s[3]\n"
+ "fmla z21.s, z7.s, z3.s[3]\n"
+ "fmla z25.s, z7.s, z4.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[3]\n"
+ "fmla z14.s, z6.s, z1.s[3]\n"
+ "fmla z18.s, z6.s, z2.s[3]\n"
+ "fmla z22.s, z6.s, z3.s[3]\n"
+ "fmla z26.s, z6.s, z4.s[3]\n"
+ "fmla z11.s, z7.s, z0.s[3]\n"
+ "fmla z15.s, z7.s, z1.s[3]\n"
+ "fmla z19.s, z7.s, z2.s[3]\n"
+ "fmla z23.s, z7.s, z3.s[3]\n"
+ "fmla z27.s, z7.s, z4.s[3]\n"
+ "bgt 66b\n"
+ "67:" // Height 5: Multiply loop: Single iteration only
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "whilelt p0.s, XZR, x11\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "fmla z8.s, z6.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.s, z7.s, z0.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.s, z6.s, z1.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.s, z6.s, z2.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.s, z7.s, z1.s[0]\n"
+ "add x22, x22, #0x10\n"
+ "fmla z17.s, z7.s, z2.s[0]\n"
+ "fmla z20.s, z6.s, z3.s[0]\n"
+ "fmla z24.s, z6.s, z4.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z21.s, z7.s, z3.s[0]\n"
+ "fmla z25.s, z7.s, z4.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "fmla z14.s, z6.s, z1.s[0]\n"
+ "fmla z18.s, z6.s, z2.s[0]\n"
+ "fmla z22.s, z6.s, z3.s[0]\n"
+ "fmla z26.s, z6.s, z4.s[0]\n"
+ "fmla z11.s, z7.s, z0.s[0]\n"
+ "fmla z15.s, z7.s, z1.s[0]\n"
+ "fmla z19.s, z7.s, z2.s[0]\n"
+ "fmla z23.s, z7.s, z3.s[0]\n"
+ "fmla z27.s, z7.s, z4.s[0]\n"
+ "ble 68f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.s, z6.s, z1.s[1]\n"
+ "fmla z16.s, z6.s, z2.s[1]\n"
+ "fmla z20.s, z6.s, z3.s[1]\n"
+ "fmla z24.s, z6.s, z4.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[1]\n"
+ "fmla z13.s, z7.s, z1.s[1]\n"
+ "fmla z17.s, z7.s, z2.s[1]\n"
+ "fmla z21.s, z7.s, z3.s[1]\n"
+ "fmla z25.s, z7.s, z4.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[1]\n"
+ "fmla z14.s, z6.s, z1.s[1]\n"
+ "fmla z18.s, z6.s, z2.s[1]\n"
+ "fmla z22.s, z6.s, z3.s[1]\n"
+ "fmla z26.s, z6.s, z4.s[1]\n"
+ "fmla z11.s, z7.s, z0.s[1]\n"
+ "fmla z15.s, z7.s, z1.s[1]\n"
+ "fmla z19.s, z7.s, z2.s[1]\n"
+ "fmla z23.s, z7.s, z3.s[1]\n"
+ "fmla z27.s, z7.s, z4.s[1]\n"
+ "ble 68f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.s, z6.s, z1.s[2]\n"
+ "fmla z16.s, z6.s, z2.s[2]\n"
+ "fmla z20.s, z6.s, z3.s[2]\n"
+ "fmla z24.s, z6.s, z4.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[2]\n"
+ "fmla z13.s, z7.s, z1.s[2]\n"
+ "fmla z17.s, z7.s, z2.s[2]\n"
+ "fmla z21.s, z7.s, z3.s[2]\n"
+ "fmla z25.s, z7.s, z4.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[2]\n"
+ "fmla z14.s, z6.s, z1.s[2]\n"
+ "fmla z18.s, z6.s, z2.s[2]\n"
+ "fmla z22.s, z6.s, z3.s[2]\n"
+ "fmla z26.s, z6.s, z4.s[2]\n"
+ "fmla z11.s, z7.s, z0.s[2]\n"
+ "fmla z15.s, z7.s, z1.s[2]\n"
+ "fmla z19.s, z7.s, z2.s[2]\n"
+ "fmla z23.s, z7.s, z3.s[2]\n"
+ "fmla z27.s, z7.s, z4.s[2]\n"
+ "ble 68f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z12.s, z6.s, z1.s[3]\n"
+ "fmla z16.s, z6.s, z2.s[3]\n"
+ "fmla z20.s, z6.s, z3.s[3]\n"
+ "fmla z24.s, z6.s, z4.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[3]\n"
+ "fmla z13.s, z7.s, z1.s[3]\n"
+ "fmla z17.s, z7.s, z2.s[3]\n"
+ "fmla z21.s, z7.s, z3.s[3]\n"
+ "fmla z25.s, z7.s, z4.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[3]\n"
+ "fmla z14.s, z6.s, z1.s[3]\n"
+ "fmla z18.s, z6.s, z2.s[3]\n"
+ "fmla z22.s, z6.s, z3.s[3]\n"
+ "fmla z26.s, z6.s, z4.s[3]\n"
+ "fmla z11.s, z7.s, z0.s[3]\n"
+ "fmla z15.s, z7.s, z1.s[3]\n"
+ "fmla z19.s, z7.s, z2.s[3]\n"
+ "fmla z23.s, z7.s, z3.s[3]\n"
+ "fmla z27.s, z7.s, z4.s[3]\n"
+ "68:" // Height 5: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 63b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 69f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z1.s\n"
+ "fmax z14.s, p5/M, z14.s, z1.s\n"
+ "fmax z15.s, p5/M, z15.s, z1.s\n"
+ "fmax z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmin z20.s, p5/M, z20.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z1.s\n"
+ "fmax z18.s, p5/M, z18.s, z1.s\n"
+ "fmax z19.s, p5/M, z19.s, z1.s\n"
+ "fmax z20.s, p5/M, z20.s, z1.s\n"
+ "fmin z21.s, p5/M, z21.s, z0.s\n"
+ "fmin z22.s, p5/M, z22.s, z0.s\n"
+ "fmin z23.s, p5/M, z23.s, z0.s\n"
+ "fmin z24.s, p5/M, z24.s, z0.s\n"
+ "fmax z21.s, p5/M, z21.s, z1.s\n"
+ "fmax z22.s, p5/M, z22.s, z1.s\n"
+ "fmax z23.s, p5/M, z23.s, z1.s\n"
+ "fmax z24.s, p5/M, z24.s, z1.s\n"
+ "fmin z25.s, p5/M, z25.s, z0.s\n"
+ "fmin z26.s, p5/M, z26.s, z0.s\n"
+ "fmin z27.s, p5/M, z27.s, z0.s\n"
+ "fmax z25.s, p5/M, z25.s, z1.s\n"
+ "fmax z26.s, p5/M, z26.s, z1.s\n"
+ "fmax z27.s, p5/M, z27.s, z1.s\n"
+ "69:" // Height 5: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1w { z20.s }, p4, [x25]\n"
+ "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "st1w { z24.s }, p4, [x23]\n"
+ "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "70:" // Height 5: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 59b\n"
+ "b 86f\n"
+ "71:" // Height 6
+ "ldr x16, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x14, %x[bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 72f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "ldr x21, [%x[output_ptr], #0x28]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 73f\n"
+ "72:" // Height 6: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "add x21, x23, x19, LSL #2\n"
+ "add %x[output_ptr], x21, x19, LSL #2\n"
+ "73:" // Height 6: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x16\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x16\n"
+ "cbz x14, 74f\n"
+ "ld1w { z8.s }, p5/Z, [x14]\n"
+ "mov z12.d, z8.d\n"
+ "ld1w { z9.s }, p5/Z, [x14, #1, MUL VL]\n"
+ "mov z16.d, z8.d\n"
+ "ld1w { z10.s }, p5/Z, [x14, #2, MUL VL]\n"
+ "mov z20.d, z8.d\n"
+ "ld1w { z11.s }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "mov z13.d, z9.d\n"
+ "mov z17.d, z9.d\n"
+ "mov z14.d, z10.d\n"
+ "mov z15.d, z11.d\n"
+ "mov z18.d, z10.d\n"
+ "mov z19.d, z11.d\n"
+ "mov z21.d, z9.d\n"
+ "mov z22.d, z10.d\n"
+ "mov z23.d, z11.d\n"
+ "mov z24.d, z8.d\n"
+ "mov z25.d, z9.d\n"
+ "mov z26.d, z10.d\n"
+ "mov z27.d, z11.d\n"
+ "mov z28.d, z8.d\n"
+ "mov z29.d, z9.d\n"
+ "mov z30.d, z10.d\n"
+ "mov z31.d, z11.d\n"
+ "b 76f\n"
+ "74:" // Height 6: no bias
+ "tbz %x[flags], #0, 75f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25]\n"
+ "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x23]\n"
+ "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x21]\n"
+ "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "b 76f\n"
+ "75:" // Height 6: no accumulate
+ "mov z8.b, #0x0\n"
+ "mov z9.b, #0x0\n"
+ "mov z10.b, #0x0\n"
+ "mov z11.b, #0x0\n"
+ "mov z12.b, #0x0\n"
+ "mov z13.b, #0x0\n"
+ "mov z14.b, #0x0\n"
+ "mov z15.b, #0x0\n"
+ "mov z16.b, #0x0\n"
+ "mov z17.b, #0x0\n"
+ "mov z18.b, #0x0\n"
+ "mov z19.b, #0x0\n"
+ "mov z20.b, #0x0\n"
+ "mov z21.b, #0x0\n"
+ "mov z22.b, #0x0\n"
+ "mov z23.b, #0x0\n"
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "76:" // Height 6: setup done
+ "mov x12, #0x0\n"
+ "77:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 78f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x12, 79f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "add x20, x20, x19, LSL #2\n"
+ "b 79f\n"
+ "78:" // Height 6: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "add x22, x24, x19, LSL #2\n"
+ "add x20, x22, x19, LSL #2\n"
+ "79:" // Height 6: input setup done
+ "cmp x11, #0x4\n"
+ "ble 81f\n"
+ "80:" // Height 6: Multiply loop: Main loop head
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "whilelt p0.s, XZR, x11\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "sub x11, x11, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "fmla z8.s, z6.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.s, z7.s, z0.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.s, z6.s, z1.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.s, z6.s, z2.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.s, z7.s, z1.s[0]\n"
+ "ld1rqw { z5.s }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "fmla z20.s, z6.s, z3.s[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z24.s, z6.s, z4.s[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x4\n"
+ "fmla z28.s, z6.s, z5.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z17.s, z7.s, z2.s[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla z21.s, z7.s, z3.s[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla z25.s, z7.s, z4.s[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla z29.s, z7.s, z5.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "fmla z14.s, z6.s, z1.s[0]\n"
+ "fmla z18.s, z6.s, z2.s[0]\n"
+ "fmla z22.s, z6.s, z3.s[0]\n"
+ "fmla z26.s, z6.s, z4.s[0]\n"
+ "fmla z30.s, z6.s, z5.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #4, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[0]\n"
+ "fmla z15.s, z7.s, z1.s[0]\n"
+ "fmla z19.s, z7.s, z2.s[0]\n"
+ "fmla z23.s, z7.s, z3.s[0]\n"
+ "fmla z27.s, z7.s, z4.s[0]\n"
+ "fmla z31.s, z7.s, z5.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #5, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[1]\n"
+ "fmla z12.s, z6.s, z1.s[1]\n"
+ "fmla z16.s, z6.s, z2.s[1]\n"
+ "fmla z20.s, z6.s, z3.s[1]\n"
+ "fmla z24.s, z6.s, z4.s[1]\n"
+ "fmla z28.s, z6.s, z5.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #6, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[1]\n"
+ "fmla z13.s, z7.s, z1.s[1]\n"
+ "fmla z17.s, z7.s, z2.s[1]\n"
+ "fmla z21.s, z7.s, z3.s[1]\n"
+ "fmla z25.s, z7.s, z4.s[1]\n"
+ "fmla z29.s, z7.s, z5.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #7, MUL VL]\n"
+ "addvl x15, x15, #16\n"
+ "fmla z10.s, z6.s, z0.s[1]\n"
+ "fmla z14.s, z6.s, z1.s[1]\n"
+ "fmla z18.s, z6.s, z2.s[1]\n"
+ "fmla z22.s, z6.s, z3.s[1]\n"
+ "fmla z26.s, z6.s, z4.s[1]\n"
+ "fmla z30.s, z6.s, z5.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-8, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[1]\n"
+ "fmla z15.s, z7.s, z1.s[1]\n"
+ "fmla z19.s, z7.s, z2.s[1]\n"
+ "fmla z23.s, z7.s, z3.s[1]\n"
+ "fmla z27.s, z7.s, z4.s[1]\n"
+ "fmla z31.s, z7.s, z5.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-7, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[2]\n"
+ "fmla z12.s, z6.s, z1.s[2]\n"
+ "fmla z16.s, z6.s, z2.s[2]\n"
+ "fmla z20.s, z6.s, z3.s[2]\n"
+ "fmla z24.s, z6.s, z4.s[2]\n"
+ "fmla z28.s, z6.s, z5.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-6, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[2]\n"
+ "fmla z13.s, z7.s, z1.s[2]\n"
+ "fmla z17.s, z7.s, z2.s[2]\n"
+ "fmla z21.s, z7.s, z3.s[2]\n"
+ "fmla z25.s, z7.s, z4.s[2]\n"
+ "fmla z29.s, z7.s, z5.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-5, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[2]\n"
+ "fmla z14.s, z6.s, z1.s[2]\n"
+ "fmla z18.s, z6.s, z2.s[2]\n"
+ "fmla z22.s, z6.s, z3.s[2]\n"
+ "fmla z26.s, z6.s, z4.s[2]\n"
+ "fmla z30.s, z6.s, z5.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-4, MUL VL]\n"
+ "fmla z11.s, z7.s, z0.s[2]\n"
+ "fmla z15.s, z7.s, z1.s[2]\n"
+ "fmla z19.s, z7.s, z2.s[2]\n"
+ "fmla z23.s, z7.s, z3.s[2]\n"
+ "fmla z27.s, z7.s, z4.s[2]\n"
+ "fmla z31.s, z7.s, z5.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-3, MUL VL]\n"
+ "fmla z8.s, z6.s, z0.s[3]\n"
+ "fmla z12.s, z6.s, z1.s[3]\n"
+ "fmla z16.s, z6.s, z2.s[3]\n"
+ "fmla z20.s, z6.s, z3.s[3]\n"
+ "fmla z24.s, z6.s, z4.s[3]\n"
+ "fmla z28.s, z6.s, z5.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #-2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[3]\n"
+ "fmla z13.s, z7.s, z1.s[3]\n"
+ "fmla z17.s, z7.s, z2.s[3]\n"
+ "fmla z21.s, z7.s, z3.s[3]\n"
+ "fmla z25.s, z7.s, z4.s[3]\n"
+ "fmla z29.s, z7.s, z5.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #-1, MUL VL]\n"
+ "fmla z10.s, z6.s, z0.s[3]\n"
+ "fmla z14.s, z6.s, z1.s[3]\n"
+ "fmla z18.s, z6.s, z2.s[3]\n"
+ "fmla z22.s, z6.s, z3.s[3]\n"
+ "fmla z26.s, z6.s, z4.s[3]\n"
+ "fmla z30.s, z6.s, z5.s[3]\n"
+ "fmla z11.s, z7.s, z0.s[3]\n"
+ "fmla z15.s, z7.s, z1.s[3]\n"
+ "fmla z19.s, z7.s, z2.s[3]\n"
+ "fmla z23.s, z7.s, z3.s[3]\n"
+ "fmla z27.s, z7.s, z4.s[3]\n"
+ "fmla z31.s, z7.s, z5.s[3]\n"
+ "bgt 80b\n"
+ "81:" // Height 6: Multiply loop: Single iteration only
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "whilelt p0.s, XZR, x11\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x10]\n"
+ "fmla z8.s, z6.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z9.s, z7.s, z0.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z12.s, z6.s, z1.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z16.s, z6.s, z2.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z13.s, z7.s, z1.s[0]\n"
+ "ld1rqw { z5.s }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "fmla z20.s, z6.s, z3.s[0]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z17.s, z7.s, z2.s[0]\n"
+ "fmla z24.s, z6.s, z4.s[0]\n"
+ "fmla z28.s, z6.s, z5.s[0]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z21.s, z7.s, z3.s[0]\n"
+ "fmla z25.s, z7.s, z4.s[0]\n"
+ "fmla z29.s, z7.s, z5.s[0]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[0]\n"
+ "fmla z14.s, z6.s, z1.s[0]\n"
+ "fmla z18.s, z6.s, z2.s[0]\n"
+ "fmla z22.s, z6.s, z3.s[0]\n"
+ "fmla z26.s, z6.s, z4.s[0]\n"
+ "fmla z30.s, z6.s, z5.s[0]\n"
+ "fmla z11.s, z7.s, z0.s[0]\n"
+ "fmla z15.s, z7.s, z1.s[0]\n"
+ "fmla z19.s, z7.s, z2.s[0]\n"
+ "fmla z23.s, z7.s, z3.s[0]\n"
+ "fmla z27.s, z7.s, z4.s[0]\n"
+ "fmla z31.s, z7.s, z5.s[0]\n"
+ "ble 82f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.s, z6.s, z1.s[1]\n"
+ "fmla z16.s, z6.s, z2.s[1]\n"
+ "fmla z20.s, z6.s, z3.s[1]\n"
+ "fmla z24.s, z6.s, z4.s[1]\n"
+ "fmla z28.s, z6.s, z5.s[1]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[1]\n"
+ "fmla z13.s, z7.s, z1.s[1]\n"
+ "fmla z17.s, z7.s, z2.s[1]\n"
+ "fmla z21.s, z7.s, z3.s[1]\n"
+ "fmla z25.s, z7.s, z4.s[1]\n"
+ "fmla z29.s, z7.s, z5.s[1]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[1]\n"
+ "fmla z14.s, z6.s, z1.s[1]\n"
+ "fmla z18.s, z6.s, z2.s[1]\n"
+ "fmla z22.s, z6.s, z3.s[1]\n"
+ "fmla z26.s, z6.s, z4.s[1]\n"
+ "fmla z30.s, z6.s, z5.s[1]\n"
+ "fmla z11.s, z7.s, z0.s[1]\n"
+ "fmla z15.s, z7.s, z1.s[1]\n"
+ "fmla z19.s, z7.s, z2.s[1]\n"
+ "fmla z23.s, z7.s, z3.s[1]\n"
+ "fmla z27.s, z7.s, z4.s[1]\n"
+ "fmla z31.s, z7.s, z5.s[1]\n"
+ "ble 82f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "subs x11, x11, #0x1\n"
+ "fmla z12.s, z6.s, z1.s[2]\n"
+ "fmla z16.s, z6.s, z2.s[2]\n"
+ "fmla z20.s, z6.s, z3.s[2]\n"
+ "fmla z24.s, z6.s, z4.s[2]\n"
+ "fmla z28.s, z6.s, z5.s[2]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[2]\n"
+ "fmla z13.s, z7.s, z1.s[2]\n"
+ "fmla z17.s, z7.s, z2.s[2]\n"
+ "fmla z21.s, z7.s, z3.s[2]\n"
+ "fmla z25.s, z7.s, z4.s[2]\n"
+ "fmla z29.s, z7.s, z5.s[2]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[2]\n"
+ "fmla z14.s, z6.s, z1.s[2]\n"
+ "fmla z18.s, z6.s, z2.s[2]\n"
+ "fmla z22.s, z6.s, z3.s[2]\n"
+ "fmla z26.s, z6.s, z4.s[2]\n"
+ "fmla z30.s, z6.s, z5.s[2]\n"
+ "fmla z11.s, z7.s, z0.s[2]\n"
+ "fmla z15.s, z7.s, z1.s[2]\n"
+ "fmla z19.s, z7.s, z2.s[2]\n"
+ "fmla z23.s, z7.s, z3.s[2]\n"
+ "fmla z27.s, z7.s, z4.s[2]\n"
+ "fmla z31.s, z7.s, z5.s[2]\n"
+ "ble 82f\n"
+ "ld1w { z6.s }, p5/Z, [x15]\n"
+ "fmla z8.s, z6.s, z0.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #1, MUL VL]\n"
+ "fmla z12.s, z6.s, z1.s[3]\n"
+ "fmla z16.s, z6.s, z2.s[3]\n"
+ "fmla z20.s, z6.s, z3.s[3]\n"
+ "fmla z24.s, z6.s, z4.s[3]\n"
+ "fmla z28.s, z6.s, z5.s[3]\n"
+ "ld1w { z6.s }, p5/Z, [x15, #2, MUL VL]\n"
+ "fmla z9.s, z7.s, z0.s[3]\n"
+ "fmla z13.s, z7.s, z1.s[3]\n"
+ "fmla z17.s, z7.s, z2.s[3]\n"
+ "fmla z21.s, z7.s, z3.s[3]\n"
+ "fmla z25.s, z7.s, z4.s[3]\n"
+ "fmla z29.s, z7.s, z5.s[3]\n"
+ "ld1w { z7.s }, p5/Z, [x15, #3, MUL VL]\n"
+ "addvl x15, x15, #4\n"
+ "fmla z10.s, z6.s, z0.s[3]\n"
+ "fmla z14.s, z6.s, z1.s[3]\n"
+ "fmla z18.s, z6.s, z2.s[3]\n"
+ "fmla z22.s, z6.s, z3.s[3]\n"
+ "fmla z26.s, z6.s, z4.s[3]\n"
+ "fmla z30.s, z6.s, z5.s[3]\n"
+ "fmla z11.s, z7.s, z0.s[3]\n"
+ "fmla z15.s, z7.s, z1.s[3]\n"
+ "fmla z19.s, z7.s, z2.s[3]\n"
+ "fmla z23.s, z7.s, z3.s[3]\n"
+ "fmla z27.s, z7.s, z4.s[3]\n"
+ "fmla z31.s, z7.s, z5.s[3]\n"
+ "82:" // Height 6: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 77b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 83f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z1.s }, p5/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z0.s }, p5/Z, [x19]\n"
+ "fmin z8.s, p5/M, z8.s, z0.s\n"
+ "fmin z9.s, p5/M, z9.s, z0.s\n"
+ "fmin z10.s, p5/M, z10.s, z0.s\n"
+ "fmin z11.s, p5/M, z11.s, z0.s\n"
+ "fmin z12.s, p5/M, z12.s, z0.s\n"
+ "fmax z8.s, p5/M, z8.s, z1.s\n"
+ "fmax z9.s, p5/M, z9.s, z1.s\n"
+ "fmax z10.s, p5/M, z10.s, z1.s\n"
+ "fmax z11.s, p5/M, z11.s, z1.s\n"
+ "fmax z12.s, p5/M, z12.s, z1.s\n"
+ "fmin z13.s, p5/M, z13.s, z0.s\n"
+ "fmin z14.s, p5/M, z14.s, z0.s\n"
+ "fmin z15.s, p5/M, z15.s, z0.s\n"
+ "fmin z16.s, p5/M, z16.s, z0.s\n"
+ "fmax z13.s, p5/M, z13.s, z1.s\n"
+ "fmax z14.s, p5/M, z14.s, z1.s\n"
+ "fmax z15.s, p5/M, z15.s, z1.s\n"
+ "fmax z16.s, p5/M, z16.s, z1.s\n"
+ "fmin z17.s, p5/M, z17.s, z0.s\n"
+ "fmin z18.s, p5/M, z18.s, z0.s\n"
+ "fmin z19.s, p5/M, z19.s, z0.s\n"
+ "fmin z20.s, p5/M, z20.s, z0.s\n"
+ "fmax z17.s, p5/M, z17.s, z1.s\n"
+ "fmax z18.s, p5/M, z18.s, z1.s\n"
+ "fmax z19.s, p5/M, z19.s, z1.s\n"
+ "fmax z20.s, p5/M, z20.s, z1.s\n"
+ "fmin z21.s, p5/M, z21.s, z0.s\n"
+ "fmin z22.s, p5/M, z22.s, z0.s\n"
+ "fmin z23.s, p5/M, z23.s, z0.s\n"
+ "fmin z24.s, p5/M, z24.s, z0.s\n"
+ "fmax z21.s, p5/M, z21.s, z1.s\n"
+ "fmax z22.s, p5/M, z22.s, z1.s\n"
+ "fmax z23.s, p5/M, z23.s, z1.s\n"
+ "fmax z24.s, p5/M, z24.s, z1.s\n"
+ "fmin z25.s, p5/M, z25.s, z0.s\n"
+ "fmin z26.s, p5/M, z26.s, z0.s\n"
+ "fmin z27.s, p5/M, z27.s, z0.s\n"
+ "fmin z28.s, p5/M, z28.s, z0.s\n"
+ "fmax z25.s, p5/M, z25.s, z1.s\n"
+ "fmax z26.s, p5/M, z26.s, z1.s\n"
+ "fmax z27.s, p5/M, z27.s, z1.s\n"
+ "fmax z28.s, p5/M, z28.s, z1.s\n"
+ "fmin z29.s, p5/M, z29.s, z0.s\n"
+ "fmin z30.s, p5/M, z30.s, z0.s\n"
+ "fmin z31.s, p5/M, z31.s, z0.s\n"
+ "fmax z29.s, p5/M, z29.s, z1.s\n"
+ "fmax z30.s, p5/M, z30.s, z1.s\n"
+ "fmax z31.s, p5/M, z31.s, z1.s\n"
+ "83:" // Height 6: No activation
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1w { z20.s }, p4, [x25]\n"
+ "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "st1w { z24.s }, p4, [x23]\n"
+ "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "st1w { z28.s }, p4, [x21]\n"
+ "st1w { z29.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z30.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z31.s }, p1, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #4\n"
+ "84:" // Height 6: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x16, x16, x19\n"
+ "bgt 73b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 86f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 85f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "85:" // Update direct input
+ "mov x19, #0x18\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "86:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp
index fd416ed2f4..20d9922e93 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,37 +10,43 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
-
#ifdef __ARM_FEATURE_SVE
-
#include "../std_transforms_sve.hpp"
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<float>, \
+ size_t, size_t, \
+ const float *, \
+ IndirectOutputArg<float>, \
+ const float *, Activation, bool
+
namespace arm_gemm
{
// Actual kernel implementations
-void sve_hybrid_fp32_mmla_4VLx4(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+void sve_hybrid_fp32_mla_8x1VL( ARGLIST );
-class hybrid_fp32_mmla_4VLx4
+class cls_sve_hybrid_fp32_mla_8x1VL
{
public:
typedef float operand_type;
typedef float result_type;
- typedef void (*kern_type)(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
static constexpr unsigned int out_height()
@@ -50,12 +56,12 @@ public:
static unsigned int out_width()
{
- return get_vector_length<float>() * 2;
+ return get_vector_length<float>() * 1;
}
static constexpr unsigned int k_unroll()
{
- return 2;
+ return 1;
}
static constexpr bool supports_accumulate()
@@ -63,27 +69,17 @@ public:
return true;
}
- static constexpr bool supports_bias()
- {
- return true;
- }
-
- static constexpr bool supports_activation()
- {
- return true;
- }
-
- StdTransformsSVE<operand_type, result_type, 4, 4, 2> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 1, 1> transforms = {};
// Default to the generic kernel
- kern_type kernel=sve_hybrid_fp32_mmla_4VLx4;
+ kern_type kernel=sve_hybrid_fp32_mla_8x1VL;
- hybrid_fp32_mmla_4VLx4(const CPUInfo *)
+ cls_sve_hybrid_fp32_mla_8x1VL(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
+#undef ARGLIST
#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
new file mode 100644
index 0000000000..361e303c7a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mla_8x1VL/generic.cpp
@@ -0,0 +1,1751 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_fp32_mla_8x1VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<float> A_arg,
+ size_t M, size_t N, const float *B_ptr, IndirectOutputArg<float> output_arg,
+ const float *bias, Activation act, bool accumulate
+)
+{
+ struct KernelArgs { // Argument block the assembly reads via [args_ptr + offsetof(KernelArgs, field)]; args_ptr is presumably bound to &ka in the operand list, as in the sibling kernel - confirm.
+ float maxval = static_cast<float>(std::numeric_limits<float>::infinity()); // Upper clamp applied with fmin when flags bit 1 is set; stays +inf unless BoundedReLU supplies act.param1.
+ float minval = - static_cast<float>(std::numeric_limits<float>::infinity()); // Lower clamp applied with fmax when flags bit 1 is set; becomes 0 for ReLU and BoundedReLU.
+ unsigned int num_strings = {}; // Copy of the num_strings argument; the string loop counter is compared against it.
+ const unsigned int *string_lengths = {}; // One 32-bit entry per string, read as the multiply-loop trip count for that string.
+ size_t N = {}; // Copy of the N argument; loaded as the remaining-column count that drives the column predicate and column loop.
+ const float *B_ptr = {}; // Copy of the B_ptr argument; start address of the B operand vectors loaded in the multiply loop.
+ size_t output_offset = {}; // indirect.offset or direct.stride; scaled by 4 bytes in the assembly and added to the output row pointers (direct: spacing between consecutive rows).
+ size_t input_initial_col = {}; // A_arg.indirect.start_col; scaled by 4 bytes and added to each input row pointer for string 0 only.
+ size_t input_offset = {}; // indirect.start_row (scaled by 8 to index the row-pointer table) or direct.stride (scaled by 4 as spacing between input rows).
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ switch(act.type) {
+ default:
+ case Activation::Type::None:
+ break;
+ case Activation::Type::BoundedReLU:
+ ka.maxval = static_cast<float>(act.param1);
+ /* fall through */
+ case Activation::Type::ReLU:
+ ka.minval = 0;
+ flags |= 0x2;
+ break;
+ }
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x8\n"
+ "bge 99f\n"
+ "cmp %x[M], #0x6\n"
+ "bgt 85f\n"
+ "beq 71f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 57f\n"
+ "beq 43f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 29f\n"
+ "beq 15f\n"
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p1.s, x19, x6\n"
+ "cbz x8, 4f\n"
+ "ld1w { z24.s }, p2/Z, [x8]\n"
+ "addvl x8, x8, #1\n"
+ "b 6f\n"
+ "4:" // Height 1: no bias
+ "tbz %x[flags], #0, 5f\n"
+ "ld1w { z24.s }, p1/Z, [x17]\n"
+ "b 6f\n"
+ "5:" // Height 1: no accumulate
+ "mov z24.b, #0x0\n"
+ "6:" // Height 1: setup done
+ "mov x16, #0x0\n"
+ "7:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 8f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "cbnz x16, 9f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "b 9f\n"
+ "8:" // Height 1: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "9:" // Height 1: input setup done
+ "cmp x15, #0x4\n"
+ "ble 11f\n"
+ "10:" // Height 1: Multiply loop: Main loop head
+ "ld1w { z8.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
+ "sub x15, x15, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
+ "cmp x15, #0x4\n"
+ "fmla z24.s, z10.s, z0.s[2]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "addvl x7, x7, #4\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "bgt 10b\n"
+ "11:" // Height 1: Multiply loop: Single iteration only
+ "ld1w { z12.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "subs x15, x15, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z12.s, z0.s[0]\n"
+ "add x14, x14, #0x10\n"
+ "addvl x7, x7, #1\n"
+ "ble 12f\n"
+ "ld1w { z13.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z13.s, z0.s[1]\n"
+ "subs x15, x15, #0x1\n"
+ "addvl x7, x7, #1\n"
+ "ble 12f\n"
+ "ld1w { z14.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z14.s, z0.s[2]\n"
+ "subs x15, x15, #0x1\n"
+ "addvl x7, x7, #1\n"
+ "ble 12f\n"
+ "ld1w { z15.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z15.s, z0.s[3]\n"
+ "addvl x7, x7, #1\n"
+ "12:" // Height 1: Multiply loop: multiply skip
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "add x16, x16, #0x1\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x16, x19\n"
+ "bne 7b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "tbz %x[flags], #1, 13f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "13:" // Height 1: No activation
+ "st1w { z24.s }, p1, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "14:" // Height 1: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19\n"
+ "subs x6, x6, x19\n"
+ "bgt 3b\n"
+ "b 114f\n"
+ "15:" // Height 2
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 16f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "b 17f\n"
+ "16:" // Height 2: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "17:" // Height 2: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p1.s, x19, x6\n"
+ "cbz x8, 18f\n"
+ "ld1w { z24.s }, p2/Z, [x8]\n"
+ "mov z25.d, z24.d\n"
+ "addvl x8, x8, #1\n"
+ "b 20f\n"
+ "18:" // Height 2: no bias
+ "tbz %x[flags], #0, 19f\n"
+ "ld1w { z24.s }, p1/Z, [x17]\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "b 20f\n"
+ "19:" // Height 2: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "20:" // Height 2: setup done
+ "mov x16, #0x0\n"
+ "21:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 22f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "cbnz x16, 23f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "b 23f\n"
+ "22:" // Height 2: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "23:" // Height 2: input setup done
+ "cmp x15, #0x4\n"
+ "ble 25f\n"
+ "24:" // Height 2: Multiply loop: Main loop head
+ "ld1w { z8.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
+ "sub x15, x15, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z8.s, z1.s[0]\n"
+ "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
+ "cmp x15, #0x4\n"
+ "fmla z25.s, z9.s, z1.s[1]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "addvl x7, x7, #4\n"
+ "fmla z24.s, z10.s, z0.s[2]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla z25.s, z10.s, z1.s[2]\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "fmla z25.s, z11.s, z1.s[3]\n"
+ "bgt 24b\n"
+ "25:" // Height 2: Multiply loop: Single iteration only
+ "ld1w { z12.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "subs x15, x15, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z12.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z12.s, z1.s[0]\n"
+ "add x12, x12, #0x10\n"
+ "addvl x7, x7, #1\n"
+ "ble 26f\n"
+ "ld1w { z13.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z13.s, z0.s[1]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z13.s, z1.s[1]\n"
+ "addvl x7, x7, #1\n"
+ "ble 26f\n"
+ "ld1w { z14.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z14.s, z0.s[2]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z14.s, z1.s[2]\n"
+ "addvl x7, x7, #1\n"
+ "ble 26f\n"
+ "ld1w { z15.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z15.s, z0.s[3]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z25.s, z15.s, z1.s[3]\n"
+ "26:" // Height 2: Multiply loop: multiply skip
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "add x16, x16, #0x1\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x16, x19\n"
+ "bne 21b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "tbz %x[flags], #1, 27f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "27:" // Height 2: No activation
+ "st1w { z24.s }, p1, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "st1w { z25.s }, p1, [x13]\n"
+ "addvl x13, x13, #1\n"
+ "28:" // Height 2: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19\n"
+ "subs x6, x6, x19\n"
+ "bgt 17b\n"
+ "b 114f\n"
+ "29:" // Height 3
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 30f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "ldr x11, [%x[output_ptr], #0x10]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "add x11, x11, x19, LSL #2\n"
+ "b 31f\n"
+ "30:" // Height 3: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "add x11, x13, x19, LSL #2\n"
+ "31:" // Height 3: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p1.s, x19, x6\n"
+ "cbz x8, 32f\n"
+ "ld1w { z24.s }, p2/Z, [x8]\n"
+ "mov z25.d, z24.d\n"
+ "addvl x8, x8, #1\n"
+ "mov z26.d, z24.d\n"
+ "b 34f\n"
+ "32:" // Height 3: no bias
+ "tbz %x[flags], #0, 33f\n"
+ "ld1w { z24.s }, p1/Z, [x17]\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "ld1w { z26.s }, p1/Z, [x11]\n"
+ "b 34f\n"
+ "33:" // Height 3: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "34:" // Height 3: setup done
+ "mov x16, #0x0\n"
+ "35:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 36f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "cbnz x16, 37f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x10, x10, x19, LSL #2\n"
+ "b 37f\n"
+ "36:" // Height 3: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "add x10, x12, x19, LSL #2\n"
+ "37:" // Height 3: input setup done
+ "cmp x15, #0x4\n"
+ "ble 39f\n"
+ "38:" // Height 3: Multiply loop: Main loop head
+ "ld1w { z8.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
+ "sub x15, x15, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z8.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
+ "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
+ "cmp x15, #0x4\n"
+ "fmla z25.s, z9.s, z1.s[1]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "addvl x7, x7, #4\n"
+ "fmla z24.s, z10.s, z0.s[2]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla z26.s, z9.s, z2.s[1]\n"
+ "fmla z25.s, z10.s, z1.s[2]\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z25.s, z11.s, z1.s[3]\n"
+ "fmla z26.s, z11.s, z2.s[3]\n"
+ "bgt 38b\n"
+ "39:" // Height 3: Multiply loop: Single iteration only
+ "ld1w { z12.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "subs x15, x15, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z12.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z12.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z26.s, z12.s, z2.s[0]\n"
+ "add x10, x10, #0x10\n"
+ "addvl x7, x7, #1\n"
+ "ble 40f\n"
+ "ld1w { z13.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z13.s, z0.s[1]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z13.s, z1.s[1]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z26.s, z13.s, z2.s[1]\n"
+ "ble 40f\n"
+ "ld1w { z14.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z14.s, z0.s[2]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z14.s, z1.s[2]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z26.s, z14.s, z2.s[2]\n"
+ "ble 40f\n"
+ "ld1w { z15.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z15.s, z0.s[3]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z25.s, z15.s, z1.s[3]\n"
+ "fmla z26.s, z15.s, z2.s[3]\n"
+ "40:" // Height 3: Multiply loop: multiply skip
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "add x16, x16, #0x1\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x16, x19\n"
+ "bne 35b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "tbz %x[flags], #1, 41f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmin z26.s, p2/M, z26.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "fmax z26.s, p2/M, z26.s, z17.s\n"
+ "41:" // Height 3: No activation
+ "st1w { z24.s }, p1, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "st1w { z25.s }, p1, [x13]\n"
+ "addvl x13, x13, #1\n"
+ "st1w { z26.s }, p1, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "42:" // Height 3: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19\n"
+ "subs x6, x6, x19\n"
+ "bgt 31b\n"
+ "b 114f\n"
+ "43:" // Height 4
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 44f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "ldr x11, [%x[output_ptr], #0x10]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x18]\n"
+ "add x11, x11, x19, LSL #2\n"
+ "add x9, x9, x19, LSL #2\n"
+ "b 45f\n"
+ "44:" // Height 4: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "add x11, x13, x19, LSL #2\n"
+ "add x9, x11, x19, LSL #2\n"
+ "45:" // Height 4: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p1.s, x19, x6\n"
+ "cbz x8, 46f\n"
+ "ld1w { z24.s }, p2/Z, [x8]\n"
+ "mov z25.d, z24.d\n"
+ "addvl x8, x8, #1\n"
+ "mov z26.d, z24.d\n"
+ "mov z27.d, z24.d\n"
+ "b 48f\n"
+ "46:" // Height 4: no bias
+ "tbz %x[flags], #0, 47f\n"
+ "ld1w { z24.s }, p1/Z, [x17]\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "ld1w { z26.s }, p1/Z, [x11]\n"
+ "ld1w { z27.s }, p1/Z, [x9]\n"
+ "b 48f\n"
+ "47:" // Height 4: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "48:" // Height 4: setup done
+ "mov x16, #0x0\n"
+ "49:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 50f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x28, [x20, #0x18]\n"
+ "cbnz x16, 51f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "b 51f\n"
+ "50:" // Height 4: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "add x10, x12, x19, LSL #2\n"
+ "add x28, x10, x19, LSL #2\n"
+ "51:" // Height 4: input setup done
+ "cmp x15, #0x4\n"
+ "ble 53f\n"
+ "52:" // Height 4: Multiply loop: Main loop head
+ "ld1w { z8.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
+ "sub x15, x15, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z8.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "ld1rqw { z3.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
+ "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z27.s, z8.s, z3.s[0]\n"
+ "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
+ "cmp x15, #0x4\n"
+ "fmla z25.s, z9.s, z1.s[1]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "addvl x7, x7, #4\n"
+ "fmla z24.s, z10.s, z0.s[2]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla z26.s, z9.s, z2.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla z27.s, z9.s, z3.s[1]\n"
+ "fmla z25.s, z10.s, z1.s[2]\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z27.s, z10.s, z3.s[2]\n"
+ "fmla z25.s, z11.s, z1.s[3]\n"
+ "fmla z26.s, z11.s, z2.s[3]\n"
+ "fmla z27.s, z11.s, z3.s[3]\n"
+ "bgt 52b\n"
+ "53:" // Height 4: Multiply loop: Single iteration only
+ "ld1w { z12.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "subs x15, x15, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z12.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z12.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z26.s, z12.s, z2.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z27.s, z12.s, z3.s[0]\n"
+ "add x28, x28, #0x10\n"
+ "addvl x7, x7, #1\n"
+ "ble 54f\n"
+ "ld1w { z13.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z13.s, z0.s[1]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z13.s, z1.s[1]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z26.s, z13.s, z2.s[1]\n"
+ "fmla z27.s, z13.s, z3.s[1]\n"
+ "ble 54f\n"
+ "ld1w { z14.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z14.s, z0.s[2]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z14.s, z1.s[2]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z26.s, z14.s, z2.s[2]\n"
+ "fmla z27.s, z14.s, z3.s[2]\n"
+ "ble 54f\n"
+ "ld1w { z15.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z15.s, z0.s[3]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z25.s, z15.s, z1.s[3]\n"
+ "fmla z26.s, z15.s, z2.s[3]\n"
+ "fmla z27.s, z15.s, z3.s[3]\n"
+ "54:" // Height 4: Multiply loop: multiply skip
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "add x16, x16, #0x1\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x16, x19\n"
+ "bne 49b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "tbz %x[flags], #1, 55f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmin z26.s, p2/M, z26.s, z16.s\n"
+ "fmin z27.s, p2/M, z27.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "fmax z26.s, p2/M, z26.s, z17.s\n"
+ "fmax z27.s, p2/M, z27.s, z17.s\n"
+ "55:" // Height 4: No activation
+ "st1w { z24.s }, p1, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "st1w { z25.s }, p1, [x13]\n"
+ "addvl x13, x13, #1\n"
+ "st1w { z26.s }, p1, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "st1w { z27.s }, p1, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "56:" // Height 4: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19\n"
+ "subs x6, x6, x19\n"
+ "bgt 45b\n"
+ "b 114f\n"
+ "57:" // Height 5
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 58f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "ldr x11, [%x[output_ptr], #0x10]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x18]\n"
+ "ldr x27, [%x[output_ptr], #0x20]\n"
+ "add x11, x11, x19, LSL #2\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "b 59f\n"
+ "58:" // Height 5: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "add x11, x13, x19, LSL #2\n"
+ "add x9, x11, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "59:" // Height 5: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p1.s, x19, x6\n"
+ "cbz x8, 60f\n"
+ "ld1w { z24.s }, p2/Z, [x8]\n"
+ "mov z25.d, z24.d\n"
+ "addvl x8, x8, #1\n"
+ "mov z26.d, z24.d\n"
+ "mov z27.d, z24.d\n"
+ "mov z28.d, z24.d\n"
+ "b 62f\n"
+ "60:" // Height 5: no bias
+ "tbz %x[flags], #0, 61f\n"
+ "ld1w { z24.s }, p1/Z, [x17]\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "ld1w { z26.s }, p1/Z, [x11]\n"
+ "ld1w { z27.s }, p1/Z, [x9]\n"
+ "ld1w { z28.s }, p1/Z, [x27]\n"
+ "b 62f\n"
+ "61:" // Height 5: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "62:" // Height 5: setup done
+ "mov x16, #0x0\n"
+ "63:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 64f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x28, [x20, #0x18]\n"
+ "ldr x26, [x20, #0x20]\n"
+ "cbnz x16, 65f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "b 65f\n"
+ "64:" // Height 5: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "add x10, x12, x19, LSL #2\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "65:" // Height 5: input setup done
+ "cmp x15, #0x4\n"
+ "ble 67f\n"
+ "66:" // Height 5: Multiply loop: Main loop head
+ "ld1w { z8.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
+ "sub x15, x15, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z8.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "ld1rqw { z3.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z27.s, z8.s, z3.s[0]\n"
+ "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z25.s, z9.s, z1.s[1]\n"
+ "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
+ "cmp x15, #0x4\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "addvl x7, x7, #4\n"
+ "fmla z26.s, z9.s, z2.s[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla z24.s, z10.s, z0.s[2]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla z27.s, z9.s, z3.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla z25.s, z10.s, z1.s[2]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla z28.s, z9.s, z4.s[1]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z27.s, z10.s, z3.s[2]\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "fmla z28.s, z10.s, z4.s[2]\n"
+ "fmla z25.s, z11.s, z1.s[3]\n"
+ "fmla z26.s, z11.s, z2.s[3]\n"
+ "fmla z27.s, z11.s, z3.s[3]\n"
+ "fmla z28.s, z11.s, z4.s[3]\n"
+ "bgt 66b\n"
+ "67:" // Height 5: Multiply loop: Single iteration only
+ "ld1w { z12.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "subs x15, x15, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z12.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z12.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z26.s, z12.s, z2.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z27.s, z12.s, z3.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z28.s, z12.s, z4.s[0]\n"
+ "add x26, x26, #0x10\n"
+ "addvl x7, x7, #1\n"
+ "ble 68f\n"
+ "ld1w { z13.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z13.s, z0.s[1]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z13.s, z1.s[1]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z26.s, z13.s, z2.s[1]\n"
+ "fmla z27.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z13.s, z4.s[1]\n"
+ "ble 68f\n"
+ "ld1w { z14.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z14.s, z0.s[2]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z14.s, z1.s[2]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z26.s, z14.s, z2.s[2]\n"
+ "fmla z27.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z14.s, z4.s[2]\n"
+ "ble 68f\n"
+ "ld1w { z15.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z15.s, z0.s[3]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z25.s, z15.s, z1.s[3]\n"
+ "fmla z26.s, z15.s, z2.s[3]\n"
+ "fmla z27.s, z15.s, z3.s[3]\n"
+ "fmla z28.s, z15.s, z4.s[3]\n"
+ "68:" // Height 5: Multiply loop: multiply skip
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "add x16, x16, #0x1\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x16, x19\n"
+ "bne 63b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "tbz %x[flags], #1, 69f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmin z26.s, p2/M, z26.s, z16.s\n"
+ "fmin z27.s, p2/M, z27.s, z16.s\n"
+ "fmin z28.s, p2/M, z28.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "fmax z26.s, p2/M, z26.s, z17.s\n"
+ "fmax z27.s, p2/M, z27.s, z17.s\n"
+ "fmax z28.s, p2/M, z28.s, z17.s\n"
+ "69:" // Height 5: No activation
+ "st1w { z24.s }, p1, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "st1w { z25.s }, p1, [x13]\n"
+ "addvl x13, x13, #1\n"
+ "st1w { z26.s }, p1, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "st1w { z27.s }, p1, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "st1w { z28.s }, p1, [x27]\n"
+ "addvl x27, x27, #1\n"
+ "70:" // Height 5: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19\n"
+ "subs x6, x6, x19\n"
+ "bgt 59b\n"
+ "b 114f\n"
+ "71:" // Height 6
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 72f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "ldr x11, [%x[output_ptr], #0x10]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x18]\n"
+ "ldr x27, [%x[output_ptr], #0x20]\n"
+ "add x11, x11, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x28]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 73f\n"
+ "72:" // Height 6: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "add x11, x13, x19, LSL #2\n"
+ "add x9, x11, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "73:" // Height 6: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p1.s, x19, x6\n"
+ "cbz x8, 74f\n"
+ "ld1w { z24.s }, p2/Z, [x8]\n"
+ "mov z25.d, z24.d\n"
+ "addvl x8, x8, #1\n"
+ "mov z26.d, z24.d\n"
+ "mov z27.d, z24.d\n"
+ "mov z28.d, z24.d\n"
+ "mov z29.d, z24.d\n"
+ "b 76f\n"
+ "74:" // Height 6: no bias
+ "tbz %x[flags], #0, 75f\n"
+ "ld1w { z24.s }, p1/Z, [x17]\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "ld1w { z26.s }, p1/Z, [x11]\n"
+ "ld1w { z27.s }, p1/Z, [x9]\n"
+ "ld1w { z28.s }, p1/Z, [x27]\n"
+ "ld1w { z29.s }, p1/Z, [x25]\n"
+ "b 76f\n"
+ "75:" // Height 6: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "76:" // Height 6: setup done
+ "mov x16, #0x0\n"
+ "77:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 78f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x28, [x20, #0x18]\n"
+ "ldr x26, [x20, #0x20]\n"
+ "ldr x24, [x20, #0x28]\n"
+ "cbnz x16, 79f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "b 79f\n"
+ "78:" // Height 6: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "add x10, x12, x19, LSL #2\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "79:" // Height 6: input setup done
+ "cmp x15, #0x4\n"
+ "ble 81f\n"
+ "80:" // Height 6: Multiply loop: Main loop head
+ "ld1w { z8.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
+ "sub x15, x15, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z8.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "ld1rqw { z3.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z27.s, z8.s, z3.s[0]\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z25.s, z9.s, z1.s[1]\n"
+ "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
+ "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
+ "cmp x15, #0x4\n"
+ "fmla z29.s, z8.s, z5.s[0]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "addvl x7, x7, #4\n"
+ "fmla z26.s, z9.s, z2.s[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla z27.s, z9.s, z3.s[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla z24.s, z10.s, z0.s[2]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla z28.s, z9.s, z4.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla z29.s, z9.s, z5.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla z25.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z27.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z10.s, z4.s[2]\n"
+ "fmla z29.s, z10.s, z5.s[2]\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "fmla z25.s, z11.s, z1.s[3]\n"
+ "fmla z26.s, z11.s, z2.s[3]\n"
+ "fmla z27.s, z11.s, z3.s[3]\n"
+ "fmla z28.s, z11.s, z4.s[3]\n"
+ "fmla z29.s, z11.s, z5.s[3]\n"
+ "bgt 80b\n"
+ "81:" // Height 6: Multiply loop: Single iteration only
+ "ld1w { z12.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "subs x15, x15, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z12.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z12.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z26.s, z12.s, z2.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z27.s, z12.s, z3.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z28.s, z12.s, z4.s[0]\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z29.s, z12.s, z5.s[0]\n"
+ "add x24, x24, #0x10\n"
+ "addvl x7, x7, #1\n"
+ "ble 82f\n"
+ "ld1w { z13.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z13.s, z0.s[1]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z13.s, z1.s[1]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z26.s, z13.s, z2.s[1]\n"
+ "fmla z27.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z13.s, z4.s[1]\n"
+ "fmla z29.s, z13.s, z5.s[1]\n"
+ "ble 82f\n"
+ "ld1w { z14.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z14.s, z0.s[2]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z14.s, z1.s[2]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z26.s, z14.s, z2.s[2]\n"
+ "fmla z27.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z14.s, z4.s[2]\n"
+ "fmla z29.s, z14.s, z5.s[2]\n"
+ "ble 82f\n"
+ "ld1w { z15.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z15.s, z0.s[3]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z25.s, z15.s, z1.s[3]\n"
+ "fmla z26.s, z15.s, z2.s[3]\n"
+ "fmla z27.s, z15.s, z3.s[3]\n"
+ "fmla z28.s, z15.s, z4.s[3]\n"
+ "fmla z29.s, z15.s, z5.s[3]\n"
+ "82:" // Height 6: Multiply loop: multiply skip
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "add x16, x16, #0x1\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x16, x19\n"
+ "bne 77b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbz %x[flags], #1, 83f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmin z26.s, p2/M, z26.s, z16.s\n"
+ "fmin z27.s, p2/M, z27.s, z16.s\n"
+ "fmin z28.s, p2/M, z28.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "fmax z26.s, p2/M, z26.s, z17.s\n"
+ "fmax z27.s, p2/M, z27.s, z17.s\n"
+ "fmax z28.s, p2/M, z28.s, z17.s\n"
+ "fmin z29.s, p2/M, z29.s, z16.s\n"
+ "fmax z29.s, p2/M, z29.s, z17.s\n"
+ "83:" // Height 6: No activation
+ "st1w { z24.s }, p1, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "st1w { z25.s }, p1, [x13]\n"
+ "addvl x13, x13, #1\n"
+ "st1w { z26.s }, p1, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "st1w { z27.s }, p1, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "st1w { z28.s }, p1, [x27]\n"
+ "addvl x27, x27, #1\n"
+ "st1w { z29.s }, p1, [x25]\n"
+ "addvl x25, x25, #1\n"
+ "84:" // Height 6: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19\n"
+ "subs x6, x6, x19\n"
+ "bgt 73b\n"
+ "b 114f\n"
+ "85:" // Height 7
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 86f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "ldr x11, [%x[output_ptr], #0x10]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x18]\n"
+ "ldr x27, [%x[output_ptr], #0x20]\n"
+ "add x11, x11, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x28]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x23, [%x[output_ptr], #0x30]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 87f\n"
+ "86:" // Height 7: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "add x11, x13, x19, LSL #2\n"
+ "add x9, x11, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "87:" // Height 7: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p1.s, x19, x6\n"
+ "cbz x8, 88f\n"
+ "ld1w { z24.s }, p2/Z, [x8]\n"
+ "mov z25.d, z24.d\n"
+ "addvl x8, x8, #1\n"
+ "mov z26.d, z24.d\n"
+ "mov z27.d, z24.d\n"
+ "mov z28.d, z24.d\n"
+ "mov z29.d, z24.d\n"
+ "mov z30.d, z24.d\n"
+ "b 90f\n"
+ "88:" // Height 7: no bias
+ "tbz %x[flags], #0, 89f\n"
+ "ld1w { z24.s }, p1/Z, [x17]\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "ld1w { z26.s }, p1/Z, [x11]\n"
+ "ld1w { z27.s }, p1/Z, [x9]\n"
+ "ld1w { z28.s }, p1/Z, [x27]\n"
+ "ld1w { z29.s }, p1/Z, [x25]\n"
+ "ld1w { z30.s }, p1/Z, [x23]\n"
+ "b 90f\n"
+ "89:" // Height 7: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "90:" // Height 7: setup done
+ "mov x16, #0x0\n"
+ "91:" // Height 7: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 92f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x28, [x20, #0x18]\n"
+ "ldr x26, [x20, #0x20]\n"
+ "ldr x24, [x20, #0x28]\n"
+ "ldr x22, [x20, #0x30]\n"
+ "cbnz x16, 93f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "b 93f\n"
+ "92:" // Height 7: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "add x10, x12, x19, LSL #2\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "add x22, x24, x19, LSL #2\n"
+ "93:" // Height 7: input setup done
+ "cmp x15, #0x4\n"
+ "ble 95f\n"
+ "94:" // Height 7: Multiply loop: Main loop head
+ "ld1w { z8.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
+ "sub x15, x15, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z8.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "ld1rqw { z3.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z27.s, z8.s, z3.s[0]\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z25.s, z9.s, z1.s[1]\n"
+ "ld1rqw { z6.s }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
+ "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+ "add x22, x22, #0x10\n"
+ "fmla z29.s, z8.s, z5.s[0]\n"
+ "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
+ "cmp x15, #0x4\n"
+ "fmla z30.s, z8.s, z6.s[0]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "addvl x7, x7, #4\n"
+ "fmla z26.s, z9.s, z2.s[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla z27.s, z9.s, z3.s[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla z28.s, z9.s, z4.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla z29.s, z9.s, z5.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla z30.s, z9.s, z6.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla z24.s, z10.s, z0.s[2]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla z25.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z27.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z10.s, z4.s[2]\n"
+ "fmla z29.s, z10.s, z5.s[2]\n"
+ "fmla z30.s, z10.s, z6.s[2]\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "fmla z25.s, z11.s, z1.s[3]\n"
+ "fmla z26.s, z11.s, z2.s[3]\n"
+ "fmla z27.s, z11.s, z3.s[3]\n"
+ "fmla z28.s, z11.s, z4.s[3]\n"
+ "fmla z29.s, z11.s, z5.s[3]\n"
+ "fmla z30.s, z11.s, z6.s[3]\n"
+ "bgt 94b\n"
+ "95:" // Height 7: Multiply loop: Single iteration only
+ "ld1w { z12.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "subs x15, x15, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z12.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z12.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z26.s, z12.s, z2.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z27.s, z12.s, z3.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z28.s, z12.s, z4.s[0]\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z29.s, z12.s, z5.s[0]\n"
+ "ld1rqw { z6.s }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z30.s, z12.s, z6.s[0]\n"
+ "add x22, x22, #0x10\n"
+ "addvl x7, x7, #1\n"
+ "ble 96f\n"
+ "ld1w { z13.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z13.s, z0.s[1]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z13.s, z1.s[1]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z26.s, z13.s, z2.s[1]\n"
+ "fmla z27.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z13.s, z4.s[1]\n"
+ "fmla z29.s, z13.s, z5.s[1]\n"
+ "fmla z30.s, z13.s, z6.s[1]\n"
+ "ble 96f\n"
+ "ld1w { z14.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z14.s, z0.s[2]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z14.s, z1.s[2]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z26.s, z14.s, z2.s[2]\n"
+ "fmla z27.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z14.s, z4.s[2]\n"
+ "fmla z29.s, z14.s, z5.s[2]\n"
+ "fmla z30.s, z14.s, z6.s[2]\n"
+ "ble 96f\n"
+ "ld1w { z15.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z15.s, z0.s[3]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z25.s, z15.s, z1.s[3]\n"
+ "fmla z26.s, z15.s, z2.s[3]\n"
+ "fmla z27.s, z15.s, z3.s[3]\n"
+ "fmla z28.s, z15.s, z4.s[3]\n"
+ "fmla z29.s, z15.s, z5.s[3]\n"
+ "fmla z30.s, z15.s, z6.s[3]\n"
+ "96:" // Height 7: Multiply loop: multiply skip
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "add x16, x16, #0x1\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x16, x19\n"
+ "bne 91b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbz %x[flags], #1, 97f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmin z26.s, p2/M, z26.s, z16.s\n"
+ "fmin z27.s, p2/M, z27.s, z16.s\n"
+ "fmin z28.s, p2/M, z28.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "fmax z26.s, p2/M, z26.s, z17.s\n"
+ "fmax z27.s, p2/M, z27.s, z17.s\n"
+ "fmax z28.s, p2/M, z28.s, z17.s\n"
+ "fmin z29.s, p2/M, z29.s, z16.s\n"
+ "fmin z30.s, p2/M, z30.s, z16.s\n"
+ "fmax z29.s, p2/M, z29.s, z17.s\n"
+ "fmax z30.s, p2/M, z30.s, z17.s\n"
+ "97:" // Height 7: No activation
+ "st1w { z24.s }, p1, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "st1w { z25.s }, p1, [x13]\n"
+ "addvl x13, x13, #1\n"
+ "st1w { z26.s }, p1, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "st1w { z27.s }, p1, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "st1w { z28.s }, p1, [x27]\n"
+ "addvl x27, x27, #1\n"
+ "st1w { z29.s }, p1, [x25]\n"
+ "addvl x25, x25, #1\n"
+ "st1w { z30.s }, p1, [x23]\n"
+ "addvl x23, x23, #1\n"
+ "98:" // Height 7: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19\n"
+ "subs x6, x6, x19\n"
+ "bgt 87b\n"
+ "b 114f\n"
+ "99:" // Height 8
+ "ldr x6, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x8, %x[bias]\n"
+ "ldr x7, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 100f\n"
+ "ldr x17, [%x[output_ptr], #0x0]\n"
+ "add x17, x17, x19, LSL #2\n"
+ "ldr x13, [%x[output_ptr], #0x8]\n"
+ "ldr x11, [%x[output_ptr], #0x10]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x18]\n"
+ "ldr x27, [%x[output_ptr], #0x20]\n"
+ "add x11, x11, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x28]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x23, [%x[output_ptr], #0x30]\n"
+ "ldr x21, [%x[output_ptr], #0x38]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add %x[output_ptr], %x[output_ptr], #0x40\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 101f\n"
+ "100:" // Height 8: setup direct output
+ "mov x17, %x[output_ptr]\n"
+ "add x13, x17, x19, LSL #2\n"
+ "add x11, x13, x19, LSL #2\n"
+ "add x9, x11, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "add x21, x23, x19, LSL #2\n"
+ "add %x[output_ptr], x21, x19, LSL #2\n"
+ "101:" // Height 8: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p1.s, x19, x6\n"
+ "cbz x8, 102f\n"
+ "ld1w { z24.s }, p2/Z, [x8]\n"
+ "mov z25.d, z24.d\n"
+ "addvl x8, x8, #1\n"
+ "mov z26.d, z24.d\n"
+ "mov z27.d, z24.d\n"
+ "mov z28.d, z24.d\n"
+ "mov z29.d, z24.d\n"
+ "mov z30.d, z24.d\n"
+ "mov z31.d, z24.d\n"
+ "b 104f\n"
+ "102:" // Height 8: no bias
+ "tbz %x[flags], #0, 103f\n"
+ "ld1w { z24.s }, p1/Z, [x17]\n"
+ "ld1w { z25.s }, p1/Z, [x13]\n"
+ "ld1w { z26.s }, p1/Z, [x11]\n"
+ "ld1w { z27.s }, p1/Z, [x9]\n"
+ "ld1w { z28.s }, p1/Z, [x27]\n"
+ "ld1w { z29.s }, p1/Z, [x25]\n"
+ "ld1w { z30.s }, p1/Z, [x23]\n"
+ "ld1w { z31.s }, p1/Z, [x21]\n"
+ "b 104f\n"
+ "103:" // Height 8: no accumulate
+ "mov z24.b, #0x0\n"
+ "mov z25.b, #0x0\n"
+ "mov z26.b, #0x0\n"
+ "mov z27.b, #0x0\n"
+ "mov z28.b, #0x0\n"
+ "mov z29.b, #0x0\n"
+ "mov z30.b, #0x0\n"
+ "mov z31.b, #0x0\n"
+ "104:" // Height 8: setup done
+ "mov x16, #0x0\n"
+ "105:" // Height 8: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w15, [x20, x16, LSL #0x2]\n"
+ "tbz %x[flags], #3, 106f\n"
+ "ldr x20, [%x[input_ptr], x16, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x14, [x20, #0x0]\n"
+ "ldr x12, [x20, #0x8]\n"
+ "ldr x10, [x20, #0x10]\n"
+ "ldr x28, [x20, #0x18]\n"
+ "ldr x26, [x20, #0x20]\n"
+ "ldr x24, [x20, #0x28]\n"
+ "ldr x22, [x20, #0x30]\n"
+ "ldr x20, [x20, #0x38]\n"
+ "cbnz x16, 107f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x14, x14, x19, LSL #2\n"
+ "add x12, x12, x19, LSL #2\n"
+ "add x10, x10, x19, LSL #2\n"
+ "add x28, x28, x19, LSL #2\n"
+ "add x26, x26, x19, LSL #2\n"
+ "add x24, x24, x19, LSL #2\n"
+ "add x22, x22, x19, LSL #2\n"
+ "add x20, x20, x19, LSL #2\n"
+ "b 107f\n"
+ "106:" // Height 8: setup direct input
+ "mov x14, %x[input_ptr]\n"
+ "add x12, x14, x19, LSL #2\n"
+ "add x10, x12, x19, LSL #2\n"
+ "add x28, x10, x19, LSL #2\n"
+ "add x26, x28, x19, LSL #2\n"
+ "add x24, x26, x19, LSL #2\n"
+ "add x22, x24, x19, LSL #2\n"
+ "add x20, x22, x19, LSL #2\n"
+ "107:" // Height 8: input setup done
+ "cmp x15, #0x4\n"
+ "ble 109f\n"
+ "108:" // Height 8: Multiply loop: Main loop head
+ "ld1w { z8.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "ld1w { z9.s }, p2/Z, [x7, #1, MUL VL]\n"
+ "sub x15, x15, #0x4\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z8.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z8.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z24.s, z9.s, z0.s[1]\n"
+ "ld1rqw { z3.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z26.s, z8.s, z2.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z27.s, z8.s, z3.s[0]\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z25.s, z9.s, z1.s[1]\n"
+ "ld1rqw { z6.s }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z28.s, z8.s, z4.s[0]\n"
+ "ld1rqw { z7.s }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "fmla z29.s, z8.s, z5.s[0]\n"
+ "ld1w { z10.s }, p2/Z, [x7, #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "fmla z30.s, z8.s, z6.s[0]\n"
+ "ld1w { z11.s }, p2/Z, [x7, #3, MUL VL]\n"
+ "cmp x15, #0x4\n"
+ "fmla z31.s, z8.s, z7.s[0]\n"
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "addvl x7, x7, #4\n"
+ "fmla z26.s, z9.s, z2.s[1]\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "fmla z27.s, z9.s, z3.s[1]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "fmla z28.s, z9.s, z4.s[1]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "fmla z29.s, z9.s, z5.s[1]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "fmla z30.s, z9.s, z6.s[1]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "fmla z31.s, z9.s, z7.s[1]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "fmla z24.s, z10.s, z0.s[2]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "fmla z25.s, z10.s, z1.s[2]\n"
+ "fmla z26.s, z10.s, z2.s[2]\n"
+ "fmla z27.s, z10.s, z3.s[2]\n"
+ "fmla z28.s, z10.s, z4.s[2]\n"
+ "fmla z29.s, z10.s, z5.s[2]\n"
+ "fmla z30.s, z10.s, z6.s[2]\n"
+ "fmla z31.s, z10.s, z7.s[2]\n"
+ "fmla z24.s, z11.s, z0.s[3]\n"
+ "fmla z25.s, z11.s, z1.s[3]\n"
+ "fmla z26.s, z11.s, z2.s[3]\n"
+ "fmla z27.s, z11.s, z3.s[3]\n"
+ "fmla z28.s, z11.s, z4.s[3]\n"
+ "fmla z29.s, z11.s, z5.s[3]\n"
+ "fmla z30.s, z11.s, z6.s[3]\n"
+ "fmla z31.s, z11.s, z7.s[3]\n"
+ "bgt 108b\n"
+ "109:" // Height 8: Multiply loop: Single iteration only
+ "ld1w { z12.s }, p2/Z, [x7]\n"
+ "whilelt p0.s, XZR, x15\n"
+ "subs x15, x15, #0x1\n"
+ "ld1rqw { z0.s }, p0/Z, [x14]\n"
+ "fmla z24.s, z12.s, z0.s[0]\n"
+ "ld1rqw { z1.s }, p0/Z, [x12]\n"
+ "add x14, x14, #0x10\n"
+ "fmla z25.s, z12.s, z1.s[0]\n"
+ "ld1rqw { z2.s }, p0/Z, [x10]\n"
+ "add x12, x12, #0x10\n"
+ "fmla z26.s, z12.s, z2.s[0]\n"
+ "ld1rqw { z3.s }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "fmla z27.s, z12.s, z3.s[0]\n"
+ "ld1rqw { z4.s }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "fmla z28.s, z12.s, z4.s[0]\n"
+ "ld1rqw { z5.s }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "fmla z29.s, z12.s, z5.s[0]\n"
+ "ld1rqw { z6.s }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "fmla z30.s, z12.s, z6.s[0]\n"
+ "ld1rqw { z7.s }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "fmla z31.s, z12.s, z7.s[0]\n"
+ "add x20, x20, #0x10\n"
+ "addvl x7, x7, #1\n"
+ "ble 110f\n"
+ "ld1w { z13.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z13.s, z0.s[1]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z13.s, z1.s[1]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z26.s, z13.s, z2.s[1]\n"
+ "fmla z27.s, z13.s, z3.s[1]\n"
+ "fmla z28.s, z13.s, z4.s[1]\n"
+ "fmla z29.s, z13.s, z5.s[1]\n"
+ "fmla z30.s, z13.s, z6.s[1]\n"
+ "fmla z31.s, z13.s, z7.s[1]\n"
+ "ble 110f\n"
+ "ld1w { z14.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z14.s, z0.s[2]\n"
+ "subs x15, x15, #0x1\n"
+ "fmla z25.s, z14.s, z1.s[2]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z26.s, z14.s, z2.s[2]\n"
+ "fmla z27.s, z14.s, z3.s[2]\n"
+ "fmla z28.s, z14.s, z4.s[2]\n"
+ "fmla z29.s, z14.s, z5.s[2]\n"
+ "fmla z30.s, z14.s, z6.s[2]\n"
+ "fmla z31.s, z14.s, z7.s[2]\n"
+ "ble 110f\n"
+ "ld1w { z15.s }, p2/Z, [x7]\n"
+ "fmla z24.s, z15.s, z0.s[3]\n"
+ "addvl x7, x7, #1\n"
+ "fmla z25.s, z15.s, z1.s[3]\n"
+ "fmla z26.s, z15.s, z2.s[3]\n"
+ "fmla z27.s, z15.s, z3.s[3]\n"
+ "fmla z28.s, z15.s, z4.s[3]\n"
+ "fmla z29.s, z15.s, z5.s[3]\n"
+ "fmla z30.s, z15.s, z6.s[3]\n"
+ "fmla z31.s, z15.s, z7.s[3]\n"
+ "110:" // Height 8: Multiply loop: multiply skip
+ "prfm pldl1keep, [x14, #0x80]\n"
+ "add x16, x16, #0x1\n"
+ "prfm pldl1keep, [x12, #0x80]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x16, x19\n"
+ "bne 105b\n"
+ "prfm pstl1keep, [x17, #0x0]\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x11, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbz %x[flags], #1, 111f\n"
+ "add x19, %x[args_ptr], %[offset_min]\n"
+ "ld1rw { z17.s }, p2/Z, [x19]\n"
+ "add x19, %x[args_ptr], %[offset_max]\n"
+ "ld1rw { z16.s }, p2/Z, [x19]\n"
+ "fmin z24.s, p2/M, z24.s, z16.s\n"
+ "fmin z25.s, p2/M, z25.s, z16.s\n"
+ "fmin z26.s, p2/M, z26.s, z16.s\n"
+ "fmin z27.s, p2/M, z27.s, z16.s\n"
+ "fmin z28.s, p2/M, z28.s, z16.s\n"
+ "fmax z24.s, p2/M, z24.s, z17.s\n"
+ "fmax z25.s, p2/M, z25.s, z17.s\n"
+ "fmax z26.s, p2/M, z26.s, z17.s\n"
+ "fmax z27.s, p2/M, z27.s, z17.s\n"
+ "fmax z28.s, p2/M, z28.s, z17.s\n"
+ "fmin z29.s, p2/M, z29.s, z16.s\n"
+ "fmin z30.s, p2/M, z30.s, z16.s\n"
+ "fmin z31.s, p2/M, z31.s, z16.s\n"
+ "fmax z29.s, p2/M, z29.s, z17.s\n"
+ "fmax z30.s, p2/M, z30.s, z17.s\n"
+ "fmax z31.s, p2/M, z31.s, z17.s\n"
+ "111:" // Height 8: No activation
+ "st1w { z24.s }, p1, [x17]\n"
+ "addvl x17, x17, #1\n"
+ "st1w { z25.s }, p1, [x13]\n"
+ "addvl x13, x13, #1\n"
+ "st1w { z26.s }, p1, [x11]\n"
+ "addvl x11, x11, #1\n"
+ "st1w { z27.s }, p1, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "st1w { z28.s }, p1, [x27]\n"
+ "addvl x27, x27, #1\n"
+ "st1w { z29.s }, p1, [x25]\n"
+ "addvl x25, x25, #1\n"
+ "st1w { z30.s }, p1, [x23]\n"
+ "addvl x23, x23, #1\n"
+ "st1w { z31.s }, p1, [x21]\n"
+ "addvl x21, x21, #1\n"
+ "112:" // Height 8: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19\n"
+ "subs x6, x6, x19\n"
+ "bgt 101b\n"
+ "subs %x[M], %x[M], #0x8\n"
+ "beq 114f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 113f\n"
+ "add x20, x20, #0x8\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "113:" // Update direct input
+ "mov x19, #0x20\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "114:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [bias] "r" (bias), [flags] "r" (flags), [offset_max] "I" (offsetof(KernelArgs, maxval)), [offset_min] "I" (offsetof(KernelArgs, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp
deleted file mode 100644
index 1364585604..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_fp32_mmla_4VLx4/generic.cpp
+++ /dev/null
@@ -1,3459 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_fp32_mmla_4VLx4(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool accumulate) {
- const int K_stride = ((K + 1) / 2) * 2;
- const long loops_count = ((K + 4) / 8) - 1;
- K -= loops_count * 8;
- const long regs_count = (K / 4) - 1;
- K -= (regs_count + 1) * 4;
- const long leftovers = K;
- const long blocks_count = (K + 1) / 2;
- float nullbias[128];
- if (!accumulate && !bias) {
- memset(nullbias, 0, (2 * get_vector_length<float>() * sizeof(float)));
- }
- float minval = - static_cast<float>(std::numeric_limits<float>::infinity());
- float maxval = static_cast<float>(std::numeric_limits<float>::infinity());
- const float * const minptr = &minval;
- const float * const maxptr = &maxval;
-
- switch(act.type)
- {
- default:
- case Activation::Type::None:
- break;
- case Activation::Type::BoundedReLU:
- maxval = static_cast<float>(act.param1);
- /* fall through */
- case Activation::Type::ReLU:
- minval = 0.0f;
- break;
- }
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const float * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(float);
-
- float *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 8) {
- if (rows_to_compute % 8) {
- rows_to_compute = 8 - 1;
- } else {
- rows_to_compute = 8;
- }
- }
-
- for (int x0=0; x0<N; x0+=(2 * get_vector_length<float>())) {
- const long width = std::min((unsigned long)N-x0, (2 * get_vector_length<float>()));
- long loops = loops_count;
- long regs = regs_count;
- long temp = 0;
- long blocks = blocks_count;
- const float *a_ptr0 = a_ptr0_base;
- const float *b_ptr0 = B + (K_stride * x0);
- const unsigned long ldcb = ldc * sizeof(float);
- const float *biasptr = bias ? bias+x0 : nullbias;
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "whilelt p6.s, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.s\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z1.s, #0\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "mov z14.s, #0\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "mov z1.s, #0\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "mov z14.s, #0\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z5.s, #0\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z0.d, z4.d, z5.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "mov z1.s, #0\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z5.s, #0\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z0.d, z4.d, z5.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "mov z1.s, #0\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z5.s, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp1 z1.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "st1w z1.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "whilelt p6.s, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.s\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "trn1 z0.d, z4.d, z5.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "subs %[loops], %[loops], #0x1\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "trn1 z0.d, z4.d, z5.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z5.s, p6/z, [a_ptr1]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "trn1 z0.d, z4.d, z5.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "whilelt p6.s, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.s\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z3.s, #0\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "mov z20.d, z16.d\n"
- "mov z21.d, z17.d\n"
- "mov z22.d, z18.d\n"
- "mov z23.d, z19.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "mov z3.s, #0\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "mov z14.s, #0\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "mov z14.s, #0\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z7.s, #0\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "trn2 z9.d, z6.d, z7.d\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z1.d, z6.d, z7.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "mov z3.s, #0\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "trn2 z8.d, z4.d, z5.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z7.s, #0\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "trn2 z9.d, z6.d, z7.d\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z1.d, z6.d, z7.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "mov z3.s, #0\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqw z5.s, p6/z, [a_ptr1]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z6.s, p6/z, [a_ptr2]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z7.s, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp1 z5.s, z22.s, z23.s\n"
- "st1w z4.s, p0, [c_ptr2]\n"
- "st1w z5.s, p1, [c_ptr2, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "whilelt p6.s, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.s\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z21.d, z17.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "mov z22.d, z18.d\n"
- "mov z23.d, z19.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z14.s, p0/z, [c_ptr3]\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1rqw z7.s, p7/z, [a_ptr3]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z1.d, z6.d, z7.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1rqw z7.s, p7/z, [a_ptr3]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z1.d, z6.d, z7.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqw z5.s, p6/z, [a_ptr1]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z6.s, p6/z, [a_ptr2]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1rqw z7.s, p6/z, [a_ptr3]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp2 z5.s, z20.s, z21.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "st1w z4.s, p0, [c_ptr2]\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "st1w z5.s, p0, [c_ptr3]\n"
- "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
- "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- case 5:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "c_ptr1 .req X4\n"
- "c_ptr2 .req X5\n"
- "c_ptr3 .req X6\n"
- "c_ptr4 .req X7\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "whilelt p6.s, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.s\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z5.s, #0\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z21.d, z17.d\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "mov z22.d, z18.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "mov z23.d, z19.d\n"
- "mov z24.d, z16.d\n"
- "mov z25.d, z17.d\n"
- "mov z26.d, z18.d\n"
- "mov z27.d, z19.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "mov z5.s, #0\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z14.s, p0/z, [c_ptr3]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr4]\n"
- "mov z14.s, #0\n"
- "zip1 z24.s, z13.s, z14.s\n"
- "zip2 z25.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
- "mov z14.s, #0\n"
- "zip1 z26.s, z13.s, z14.s\n"
- "zip2 z27.s, z13.s, z14.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr3]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z8.s, p7/z, [a_ptr4]\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z9.s, #0\n"
- "add a_ptr4, a_ptr4, #0x20\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "trn2 z10.d, z8.d, z9.d\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z2.d, z8.d, z9.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "mov z5.s, #0\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr3]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z8.s, p7/z, [a_ptr4]\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z9.s, #0\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "trn2 z10.d, z8.d, z9.d\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z2.d, z8.d, z9.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "addvl a_ptr4, a_ptr4, #2\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "mov z5.s, #0\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z6.s, p6/z, [a_ptr2]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqw z7.s, p6/z, [a_ptr3]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z5.s, p6/z, [a_ptr1]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z8.s, p6/z, [a_ptr4]\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "addvl a_ptr4, a_ptr4, #1\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z9.s, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "trn1 z2.d, z8.d, z9.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp2 z5.s, z20.s, z21.s\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "st1w z4.s, p0, [c_ptr2]\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "st1w z5.s, p0, [c_ptr3]\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "uzp1 z8.s, z24.s, z25.s\n"
- "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
- "uzp1 z9.s, z26.s, z27.s\n"
- "st1w z8.s, p0, [c_ptr4]\n"
- "st1w z9.s, p1, [c_ptr4, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "cc", "memory"
- );
- break;
- case 6:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "c_ptr1 .req X5\n"
- "c_ptr2 .req X6\n"
- "c_ptr3 .req X7\n"
- "c_ptr4 .req X8\n"
- "c_ptr5 .req X9\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "whilelt p6.s, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.s\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z20.d, z16.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z21.d, z17.d\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z22.d, z18.d\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "mov z23.d, z19.d\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "mov z24.d, z16.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "mov z25.d, z17.d\n"
- "mov z26.d, z18.d\n"
- "mov z27.d, z19.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z14.s, p0/z, [c_ptr3]\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr4]\n"
- "ld1w z14.s, p0/z, [c_ptr5]\n"
- "zip1 z24.s, z13.s, z14.s\n"
- "zip2 z25.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n"
- "zip1 z26.s, z13.s, z14.s\n"
- "zip2 z27.s, z13.s, z14.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr3]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "subs %[loops], %[loops], #0x1\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z8.s, p7/z, [a_ptr4]\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1rqw z9.s, p7/z, [a_ptr5]\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "add a_ptr4, a_ptr4, #0x20\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "add a_ptr5, a_ptr5, #0x20\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z2.d, z8.d, z9.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr3]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z8.s, p7/z, [a_ptr4]\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1rqw z9.s, p7/z, [a_ptr5]\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "trn2 z10.d, z8.d, z9.d\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z2.d, z8.d, z9.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "addvl a_ptr4, a_ptr4, #2\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "addvl a_ptr5, a_ptr5, #2\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1rqw z6.s, p6/z, [a_ptr2]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1rqw z7.s, p6/z, [a_ptr3]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z5.s, p6/z, [a_ptr1]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z8.s, p6/z, [a_ptr4]\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "addvl a_ptr4, a_ptr4, #1\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1rqw z9.s, p6/z, [a_ptr5]\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "addvl a_ptr5, a_ptr5, #1\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "trn1 z2.d, z8.d, z9.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp2 z5.s, z20.s, z21.s\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "st1w z4.s, p0, [c_ptr2]\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "st1w z5.s, p0, [c_ptr3]\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "uzp1 z8.s, z24.s, z25.s\n"
- "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
- "uzp2 z9.s, z24.s, z25.s\n"
- "uzp1 z10.s, z26.s, z27.s\n"
- "uzp2 z11.s, z26.s, z27.s\n"
- "st1w z8.s, p0, [c_ptr4]\n"
- "st1w z9.s, p0, [c_ptr5]\n"
- "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n"
- "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "cc", "memory"
- );
- break;
- case 7:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "c_ptr1 .req X6\n"
- "c_ptr2 .req X7\n"
- "c_ptr3 .req X8\n"
- "c_ptr4 .req X9\n"
- "c_ptr5 .req X10\n"
- "c_ptr6 .req X11\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "whilelt p6.s, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.s\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z7.s, #0\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "mov z20.d, z16.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z21.d, z17.d\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z22.d, z18.d\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "mov z23.d, z19.d\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "mov z24.d, z16.d\n"
- "add a_ptr6, a_ptr6, #0x10\n"
- "mov z25.d, z17.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "mov z26.d, z18.d\n"
- "mov z27.d, z19.d\n"
- "mov z28.d, z16.d\n"
- "mov z29.d, z17.d\n"
- "mov z30.d, z18.d\n"
- "mov z31.d, z19.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "mov z7.s, #0\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z14.s, p0/z, [c_ptr3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "add a_ptr6, a_ptr6, #0x10\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr4]\n"
- "ld1w z14.s, p0/z, [c_ptr5]\n"
- "zip1 z24.s, z13.s, z14.s\n"
- "zip2 z25.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n"
- "zip1 z26.s, z13.s, z14.s\n"
- "zip2 z27.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr6]\n"
- "mov z14.s, #0\n"
- "zip1 z28.s, z13.s, z14.s\n"
- "zip2 z29.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n"
- "mov z14.s, #0\n"
- "zip1 z30.s, z13.s, z14.s\n"
- "zip2 z31.s, z13.s, z14.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "subs %[loops], %[loops], #0x1\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1rqw z7.s, p7/z, [a_ptr3]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z8.s, p7/z, [a_ptr4]\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1rqw z9.s, p7/z, [a_ptr5]\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- "add a_ptr4, a_ptr4, #0x20\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- "add a_ptr5, a_ptr5, #0x20\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1rqw z10.s, p7/z, [a_ptr6]\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z11.s, #0\n"
- "add a_ptr6, a_ptr6, #0x20\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "trn1 z2.d, z8.d, z9.d\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z3.d, z10.d, z11.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z11.d, z10.d, z11.d\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #-0x10]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "mov z7.s, #0\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "trn1 z10.d, z4.d, z5.d\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1rqw z7.s, p7/z, [a_ptr3]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z8.s, p7/z, [a_ptr4]\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1rqw z9.s, p7/z, [a_ptr5]\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1rqw z10.s, p7/z, [a_ptr6]\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z11.s, #0\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "trn1 z2.d, z8.d, z9.d\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z3.d, z10.d, z11.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z11.d, z10.d, z11.d\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "addvl a_ptr4, a_ptr4, #2\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "addvl a_ptr5, a_ptr5, #2\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- "addvl a_ptr6, a_ptr6, #2\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "mov z7.s, #0\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "trn1 z10.d, z4.d, z5.d\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1rqw z5.s, p6/z, [a_ptr1]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z6.s, p6/z, [a_ptr2]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1rqw z7.s, p6/z, [a_ptr3]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z8.s, p6/z, [a_ptr4]\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1rqw z9.s, p6/z, [a_ptr5]\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- "addvl a_ptr4, a_ptr4, #1\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- "addvl a_ptr5, a_ptr5, #1\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1rqw z10.s, p6/z, [a_ptr6]\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z11.s, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "addvl a_ptr6, a_ptr6, #1\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "trn1 z2.d, z8.d, z9.d\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "trn1 z3.d, z10.d, z11.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z11.d, z10.d, z11.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp2 z5.s, z20.s, z21.s\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "st1w z4.s, p0, [c_ptr2]\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "st1w z5.s, p0, [c_ptr3]\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "fmax z28.s, p7/m, z28.s, z14.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "fmin z28.s, p7/m, z28.s, z15.s\n"
- "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
- "uzp1 z8.s, z24.s, z25.s\n"
- "uzp2 z9.s, z24.s, z25.s\n"
- "uzp1 z10.s, z26.s, z27.s\n"
- "uzp2 z11.s, z26.s, z27.s\n"
- "st1w z8.s, p0, [c_ptr4]\n"
- "fmax z29.s, p7/m, z29.s, z14.s\n"
- "fmax z30.s, p7/m, z30.s, z14.s\n"
- "fmax z31.s, p7/m, z31.s, z14.s\n"
- "st1w z9.s, p0, [c_ptr5]\n"
- "fmin z29.s, p7/m, z29.s, z15.s\n"
- "fmin z30.s, p7/m, z30.s, z15.s\n"
- "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n"
- "fmin z31.s, p7/m, z31.s, z15.s\n"
- "uzp1 z12.s, z28.s, z29.s\n"
- "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n"
- "uzp1 z13.s, z30.s, z31.s\n"
- "st1w z12.s, p0, [c_ptr6]\n"
- "st1w z13.s, p1, [c_ptr6, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "cc", "memory"
- );
- break;
- default:
- case 8:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "a_ptr4 .req X3\n"
- "a_ptr5 .req X4\n"
- "a_ptr6 .req X5\n"
- "a_ptr7 .req X6\n"
- "c_ptr1 .req X7\n"
- "c_ptr2 .req X8\n"
- "c_ptr3 .req X9\n"
- "c_ptr4 .req X10\n"
- "c_ptr5 .req X11\n"
- "c_ptr6 .req X12\n"
- "c_ptr7 .req X13\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "add a_ptr4, a_ptr3, %[lda]\n"
- "add c_ptr4, c_ptr3, %[ldc]\n"
- "add a_ptr5, a_ptr4, %[lda]\n"
- "add c_ptr5, c_ptr4, %[ldc]\n"
- "add a_ptr6, a_ptr5, %[lda]\n"
- "add c_ptr6, c_ptr5, %[ldc]\n"
- "add a_ptr7, a_ptr6, %[lda]\n"
- "add c_ptr7, c_ptr6, %[ldc]\n"
- "whilelt p6.s, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.s\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "ld1w z15.s, p0/z, [%[biasptr]]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z15.s, z15.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z15.s, z15.s\n"
- "ld1w z15.s, p1/z, [%[biasptr], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "zip1 z18.s, z15.s, z15.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "zip2 z19.s, z15.s, z15.s\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "mov z20.d, z16.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z21.d, z17.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z22.d, z18.d\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z23.d, z19.d\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "mov z24.d, z16.d\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "mov z25.d, z17.d\n"
- "add a_ptr6, a_ptr6, #0x10\n"
- "mov z26.d, z18.d\n"
- "add a_ptr7, a_ptr7, #0x10\n"
- "mov z27.d, z19.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "mov z28.d, z16.d\n"
- "mov z29.d, z17.d\n"
- "mov z30.d, z18.d\n"
- "mov z31.d, z19.d\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z13.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z14.s, p0/z, [c_ptr1]\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqw z1.s, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "zip1 z16.s, z13.s, z14.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2]\n"
- "zip2 z17.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "trn1 z8.d, z0.d, z1.d\n"
- "ld1w z14.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1rqw z3.s, p7/z, [a_ptr3]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqw z4.s, p7/z, [a_ptr4]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "zip1 z18.s, z13.s, z14.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5]\n"
- "zip2 z19.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr2]\n"
- "trn1 z9.d, z2.d, z3.d\n"
- "ld1w z14.s, p0/z, [c_ptr3]\n"
- "ld1rqw z6.s, p7/z, [a_ptr6]\n"
- "add a_ptr4, a_ptr4, #0x10\n"
- "trn1 z10.d, z4.d, z5.d\n"
- "ld1rqw z7.s, p7/z, [a_ptr7]\n"
- "zip1 z20.s, z13.s, z14.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "zip2 z21.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "add a_ptr5, a_ptr5, #0x10\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "add a_ptr6, a_ptr6, #0x10\n"
- "zip1 z22.s, z13.s, z14.s\n"
- "add a_ptr7, a_ptr7, #0x10\n"
- "zip2 z23.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr4]\n"
- "ld1w z14.s, p0/z, [c_ptr5]\n"
- "zip1 z24.s, z13.s, z14.s\n"
- "zip2 z25.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr4, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr5, #1, MUL VL]\n"
- "zip1 z26.s, z13.s, z14.s\n"
- "zip2 z27.s, z13.s, z14.s\n"
- "ld1w z13.s, p0/z, [c_ptr6]\n"
- "ld1w z14.s, p0/z, [c_ptr7]\n"
- "zip1 z28.s, z13.s, z14.s\n"
- "zip2 z29.s, z13.s, z14.s\n"
- "ld1w z13.s, p1/z, [c_ptr6, #1, MUL VL]\n"
- "ld1w z14.s, p1/z, [c_ptr7, #1, MUL VL]\n"
- "zip1 z30.s, z13.s, z14.s\n"
- "zip2 z31.s, z13.s, z14.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "subs %[loops], %[loops], #0x1\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1rqw z7.s, p7/z, [a_ptr3]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z8.s, p7/z, [a_ptr4]\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1rqw z9.s, p7/z, [a_ptr5]\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- "add a_ptr4, a_ptr4, #0x20\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- "add a_ptr5, a_ptr5, #0x20\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1rqw z10.s, p7/z, [a_ptr6]\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z11.s, p7/z, [a_ptr7]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "add a_ptr6, a_ptr6, #0x20\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "add a_ptr7, a_ptr7, #0x20\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "trn1 z2.d, z8.d, z9.d\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z3.d, z10.d, z11.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z11.d, z10.d, z11.d\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1rqw z6.s, p7/z, [a_ptr6, #-0x10]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p7/z, [a_ptr4, #-0x10]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z5.s, p7/z, [a_ptr5, #-0x10]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "ld1rqw z7.s, p7/z, [a_ptr7, #-0x10]\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p7/z, [%[a_ptr0], #-0x10]\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1rqw z1.s, p7/z, [a_ptr1, #-0x10]\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "ld1rqw z2.s, p7/z, [a_ptr2, #-0x10]\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z3.s, p7/z, [a_ptr3, #-0x10]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "trn1 z10.d, z4.d, z5.d\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p7/z, [%[a_ptr0]]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1rqw z5.s, p7/z, [a_ptr1]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z6.s, p7/z, [a_ptr2]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1rqw z7.s, p7/z, [a_ptr3]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z8.s, p7/z, [a_ptr4]\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1rqw z9.s, p7/z, [a_ptr5]\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1rqw z10.s, p7/z, [a_ptr6]\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z11.s, p7/z, [a_ptr7]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "trn1 z2.d, z8.d, z9.d\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "trn1 z3.d, z10.d, z11.d\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "trn2 z11.d, z10.d, z11.d\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1rqw z6.s, p6/z, [a_ptr6, #0x10]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p6/z, [a_ptr4, #0x10]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z5.s, p6/z, [a_ptr5, #0x10]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "ld1rqw z7.s, p6/z, [a_ptr7, #0x10]\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "addvl a_ptr4, a_ptr4, #2\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "ld1rqw z0.s, p6/z, [%[a_ptr0], #0x10]\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- "addvl a_ptr5, a_ptr5, #2\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- "addvl a_ptr6, a_ptr6, #2\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "ld1rqw z1.s, p6/z, [a_ptr1, #0x10]\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- "addvl a_ptr7, a_ptr7, #2\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "ld1rqw z2.s, p6/z, [a_ptr2, #0x10]\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z3.s, p6/z, [a_ptr3, #0x10]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #-4\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- "trn1 z8.d, z0.d, z1.d\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "trn1 z9.d, z2.d, z3.d\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "trn1 z10.d, z4.d, z5.d\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "trn1 z11.d, z6.d, z7.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "b 5f\n"
- "4:\n"
- "trn2 z0.d, z0.d, z1.d\n"
- "trn2 z1.d, z2.d, z3.d\n"
- "trn2 z2.d, z4.d, z5.d\n"
- "ld1rqw z4.s, p6/z, [%[a_ptr0]]\n"
- "trn2 z3.d, z6.d, z7.d\n"
- "ld1rqw z5.s, p6/z, [a_ptr1]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- "ld1rqw z6.s, p6/z, [a_ptr2]\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- "ld1rqw z7.s, p6/z, [a_ptr3]\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- "ld1rqw z8.s, p6/z, [a_ptr4]\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- "ld1rqw z9.s, p6/z, [a_ptr5]\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- "addvl a_ptr4, a_ptr4, #1\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- "addvl a_ptr5, a_ptr5, #1\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- "ld1rqw z10.s, p6/z, [a_ptr6]\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "ld1rqw z11.s, p6/z, [a_ptr7]\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- "addvl a_ptr6, a_ptr6, #1\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- "addvl a_ptr7, a_ptr7, #1\n"
- "trn1 z0.d, z4.d, z5.d\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- "trn1 z1.d, z6.d, z7.d\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- "trn1 z2.d, z8.d, z9.d\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "trn1 z3.d, z10.d, z11.d\n"
- "cbz %[blocks], 5f\n"
- "trn2 z11.d, z10.d, z11.d\n"
- "ld1w z12.s, p7/z, [%[b_ptr0]]\n"
- "trn2 z10.d, z8.d, z9.d\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "trn2 z9.d, z6.d, z7.d\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "trn2 z8.d, z4.d, z5.d\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- ".inst 0x64a0e590 // fmmla z16.s, z12.s, z0.s\n"
- "subs %[blocks], %[blocks], #0x1\n"
- ".inst 0x64a0e5b1 // fmmla z17.s, z13.s, z0.s\n"
- ".inst 0x64a0e5d2 // fmmla z18.s, z14.s, z0.s\n"
- ".inst 0x64a0e5f3 // fmmla z19.s, z15.s, z0.s\n"
- ".inst 0x64a1e594 // fmmla z20.s, z12.s, z1.s\n"
- ".inst 0x64a1e5b5 // fmmla z21.s, z13.s, z1.s\n"
- ".inst 0x64a1e5d6 // fmmla z22.s, z14.s, z1.s\n"
- ".inst 0x64a1e5f7 // fmmla z23.s, z15.s, z1.s\n"
- ".inst 0x64a2e598 // fmmla z24.s, z12.s, z2.s\n"
- ".inst 0x64a2e5b9 // fmmla z25.s, z13.s, z2.s\n"
- ".inst 0x64a2e5da // fmmla z26.s, z14.s, z2.s\n"
- ".inst 0x64a2e5fb // fmmla z27.s, z15.s, z2.s\n"
- ".inst 0x64a3e59c // fmmla z28.s, z12.s, z3.s\n"
- ".inst 0x64a3e5bd // fmmla z29.s, z13.s, z3.s\n"
- ".inst 0x64a3e5de // fmmla z30.s, z14.s, z3.s\n"
- ".inst 0x64a3e5ff // fmmla z31.s, z15.s, z3.s\n"
- "b.eq 5f\n"
- "ld1w z12.s, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1w z13.s, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1w z14.s, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1w z15.s, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- ".inst 0x64a8e590 // fmmla z16.s, z12.s, z8.s\n"
- ".inst 0x64a8e5b1 // fmmla z17.s, z13.s, z8.s\n"
- ".inst 0x64a8e5d2 // fmmla z18.s, z14.s, z8.s\n"
- ".inst 0x64a8e5f3 // fmmla z19.s, z15.s, z8.s\n"
- ".inst 0x64a9e594 // fmmla z20.s, z12.s, z9.s\n"
- ".inst 0x64a9e5b5 // fmmla z21.s, z13.s, z9.s\n"
- ".inst 0x64a9e5d6 // fmmla z22.s, z14.s, z9.s\n"
- ".inst 0x64a9e5f7 // fmmla z23.s, z15.s, z9.s\n"
- ".inst 0x64aae598 // fmmla z24.s, z12.s, z10.s\n"
- ".inst 0x64aae5b9 // fmmla z25.s, z13.s, z10.s\n"
- ".inst 0x64aae5da // fmmla z26.s, z14.s, z10.s\n"
- ".inst 0x64aae5fb // fmmla z27.s, z15.s, z10.s\n"
- ".inst 0x64abe59c // fmmla z28.s, z12.s, z11.s\n"
- ".inst 0x64abe5bd // fmmla z29.s, z13.s, z11.s\n"
- ".inst 0x64abe5de // fmmla z30.s, z14.s, z11.s\n"
- ".inst 0x64abe5ff // fmmla z31.s, z15.s, z11.s\n"
- "5:\n"
- "ld1rw z14.s, p7/z, [%[minptr]]\n"
- "ld1rw z15.s, p7/z, [%[maxptr]]\n"
- "fmax z16.s, p7/m, z16.s, z14.s\n"
- "fmax z17.s, p7/m, z17.s, z14.s\n"
- "fmax z18.s, p7/m, z18.s, z14.s\n"
- "fmax z19.s, p7/m, z19.s, z14.s\n"
- "fmin z16.s, p7/m, z16.s, z15.s\n"
- "fmin z17.s, p7/m, z17.s, z15.s\n"
- "fmin z18.s, p7/m, z18.s, z15.s\n"
- "fmin z19.s, p7/m, z19.s, z15.s\n"
- "fmax z20.s, p7/m, z20.s, z14.s\n"
- "uzp1 z0.s, z16.s, z17.s\n"
- "uzp2 z1.s, z16.s, z17.s\n"
- "uzp1 z2.s, z18.s, z19.s\n"
- "uzp2 z3.s, z18.s, z19.s\n"
- "st1w z0.s, p0, [%[c_ptr0]]\n"
- "fmin z20.s, p7/m, z20.s, z15.s\n"
- "fmax z21.s, p7/m, z21.s, z14.s\n"
- "fmax z22.s, p7/m, z22.s, z14.s\n"
- "st1w z1.s, p0, [c_ptr1]\n"
- "fmax z23.s, p7/m, z23.s, z14.s\n"
- "fmax z24.s, p7/m, z24.s, z14.s\n"
- "fmin z21.s, p7/m, z21.s, z15.s\n"
- "st1w z2.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "fmin z22.s, p7/m, z22.s, z15.s\n"
- "addvl %[c_ptr0], %[c_ptr0], #2\n"
- "fmin z23.s, p7/m, z23.s, z15.s\n"
- "st1w z3.s, p1, [c_ptr1, #1, MUL VL]\n"
- "uzp1 z4.s, z20.s, z21.s\n"
- "uzp2 z5.s, z20.s, z21.s\n"
- "fmin z24.s, p7/m, z24.s, z15.s\n"
- "uzp1 z6.s, z22.s, z23.s\n"
- "st1w z4.s, p0, [c_ptr2]\n"
- "uzp2 z7.s, z22.s, z23.s\n"
- "fmax z25.s, p7/m, z25.s, z14.s\n"
- "fmax z26.s, p7/m, z26.s, z14.s\n"
- "st1w z5.s, p0, [c_ptr3]\n"
- "fmax z27.s, p7/m, z27.s, z14.s\n"
- "fmax z28.s, p7/m, z28.s, z14.s\n"
- "fmin z25.s, p7/m, z25.s, z15.s\n"
- "st1w z6.s, p1, [c_ptr2, #1, MUL VL]\n"
- "fmin z26.s, p7/m, z26.s, z15.s\n"
- "fmin z27.s, p7/m, z27.s, z15.s\n"
- "fmin z28.s, p7/m, z28.s, z15.s\n"
- "st1w z7.s, p1, [c_ptr3, #1, MUL VL]\n"
- "uzp1 z8.s, z24.s, z25.s\n"
- "uzp2 z9.s, z24.s, z25.s\n"
- "uzp1 z10.s, z26.s, z27.s\n"
- "uzp2 z11.s, z26.s, z27.s\n"
- "st1w z8.s, p0, [c_ptr4]\n"
- "fmax z29.s, p7/m, z29.s, z14.s\n"
- "fmax z30.s, p7/m, z30.s, z14.s\n"
- "fmax z31.s, p7/m, z31.s, z14.s\n"
- "st1w z9.s, p0, [c_ptr5]\n"
- "fmin z29.s, p7/m, z29.s, z15.s\n"
- "fmin z30.s, p7/m, z30.s, z15.s\n"
- "st1w z10.s, p1, [c_ptr4, #1, MUL VL]\n"
- "fmin z31.s, p7/m, z31.s, z15.s\n"
- "uzp1 z12.s, z28.s, z29.s\n"
- "st1w z11.s, p1, [c_ptr5, #1, MUL VL]\n"
- "uzp2 z13.s, z28.s, z29.s\n"
- "uzp1 z14.s, z30.s, z31.s\n"
- "uzp2 z15.s, z30.s, z31.s\n"
- "st1w z12.s, p0, [c_ptr6]\n"
- "st1w z13.s, p0, [c_ptr7]\n"
- "st1w z14.s, p1, [c_ptr6, #1, MUL VL]\n"
- "st1w z15.s, p1, [c_ptr7, #1, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq a_ptr4\n"
- ".unreq a_ptr5\n"
- ".unreq a_ptr6\n"
- ".unreq a_ptr7\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- ".unreq c_ptr4\n"
- ".unreq c_ptr5\n"
- ".unreq c_ptr6\n"
- ".unreq c_ptr7\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [biasptr] "r" (biasptr), [minptr] "r" (minptr), [maxptr] "r" (maxptr), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7", "x8", "x9", "x10", "x11", "x12", "x13", "cc", "memory"
- );
- break;
- }
-
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
index c500f43fe0..0150ce8fd9 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,37 +10,43 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
-
#ifdef __ARM_FEATURE_SVE
-#include <cstdint>
#include "../std_transforms_sve.hpp"
+#define ARGLIST /* Parameter types shared by the kernel prototype and the kern_type typedef; names below follow the definition in generic.cpp. */ \
+ unsigned int, const unsigned int *, /* num_strings, string_lengths */ \
+ IndirectInputArg<int8_t>, /* A_arg */ \
+ size_t, size_t, /* M, N */ \
+ const int8_t *, /* B_ptr */ \
+ IndirectOutputArg<int8_t>, /* output_arg */ \
+ const Requantize32 *, const int32_t *, unsigned int /* qp, col_bias, unnamed trailing argument */
+
namespace arm_gemm
{
// Actual kernel implementations
-void sve_hybrid_s8s32_dot_4VLx4(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+void sve_hybrid_s8qa_dot_4x4VL( ARGLIST );
-class hybrid_s8s32_dot_4VLx4
+class cls_sve_hybrid_s8qa_dot_4x4VL
{
public:
typedef int8_t operand_type;
- typedef int32_t result_type;
+ typedef int8_t result_type;
- typedef void (*kern_type)(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
static constexpr unsigned int out_height()
@@ -60,30 +66,20 @@ public:
static constexpr bool supports_accumulate()
{
- return true;
- }
-
- static constexpr bool supports_bias()
- {
- return false;
- }
-
- static constexpr bool supports_activation()
- {
return false;
}
StdTransformsSVE<operand_type, result_type, 4, 4, 4> transforms = {};
// Default to the generic kernel
- kern_type kernel=sve_hybrid_s8s32_dot_4VLx4;
+ kern_type kernel=sve_hybrid_s8qa_dot_4x4VL;
- hybrid_s8s32_dot_4VLx4(const CPUInfo *)
+ cls_sve_hybrid_s8qa_dot_4x4VL(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
+#undef ARGLIST
#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
new file mode 100644
index 0000000000..2b1448bd65
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qa_dot_4x4VL/generic.cpp
@@ -0,0 +1,1602 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sve_hybrid_s8qa_dot_4x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 46f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 31f\n"
+ "beq 16f\n"
+ "mov z11.s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov z12.s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[col_bias]\n"
+ "mov z13.s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.b, #0x1\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "add x9, x9, x19\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "mov z16.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "whilelt p1.b, x19, x12\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "4:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "5:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 6f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 7f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "b 7f\n"
+ "6:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "7:" // Height 1: input setup done
+ "cmp x27, #0x10\n"
+ "ble 10f\n"
+ "8:" // Height 1: Multiply loop: Main loop head
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "sdot z16.s, z4.b, z0.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z17.s, z5.b, z0.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+ "sdot z18.s, z6.b, z0.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+ "sdot z19.s, z7.b, z0.b[0]\n"
+ "sdot z16.s, z8.b, z0.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "sdot z17.s, z9.b, z0.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+ "sdot z19.s, z4.b, z0.b[1]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+ "sdot z16.s, z5.b, z0.b[2]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+ "sdot z17.s, z6.b, z0.b[2]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+ "sdot z18.s, z7.b, z0.b[2]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+ "sdot z19.s, z8.b, z0.b[2]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+ "sdot z16.s, z9.b, z0.b[3]\n"
+ "sdot z17.s, z10.b, z0.b[3]\n"
+ "sdot z18.s, z4.b, z0.b[3]\n"
+ "sdot z19.s, z5.b, z0.b[3]\n"
+ "tbnz %x[flags], #31, 9f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "9:" // Height 1: Multiply loop: unique 1: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x10\n"
+ "bgt 8b\n"
+ "10:" // Height 1: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "sdot z16.s, z6.b, z0.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z17.s, z7.b, z0.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z18.s, z8.b, z0.b[0]\n"
+ "sdot z19.s, z9.b, z0.b[0]\n"
+ "ble 11f\n"
+ "ld1b { z10.b }, p2/Z, [x11]\n"
+ "sdot z16.s, z10.b, z0.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z17.s, z4.b, z0.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "sdot z18.s, z5.b, z0.b[1]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z19.s, z6.b, z0.b[1]\n"
+ "ble 11f\n"
+ "ld1b { z7.b }, p2/Z, [x11]\n"
+ "sdot z16.s, z7.b, z0.b[2]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z17.s, z8.b, z0.b[2]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "sdot z18.s, z9.b, z0.b[2]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z19.s, z10.b, z0.b[2]\n"
+ "ble 11f\n"
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "sdot z16.s, z4.b, z0.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "sdot z17.s, z5.b, z0.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z18.s, z6.b, z0.b[3]\n"
+ "sdot z19.s, z7.b, z0.b[3]\n"
+ "11:" // Height 1: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 12f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "12:" // Height 1: Multiply loop: unique 2: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x28, x28, #0x1\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x28, x19\n"
+ "bne 5b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "tbnz %x[flags], #31, 13f\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1rw { z1.s }, p2/Z, [x19]\n"
+ "neg z1.s, p2/M, z1.s\n"
+ "mov x19, #0x4\n"
+ "whilelt p0.s, XZR, x19\n"
+ "saddv d11, p0, z11.s\n"
+ "mov z11.s, z11.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z1.s\n"
+ "13:" // Height 1: skip row sum fixup
+ "add z16.s, z16.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x10]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z17.s, z17.s, z11.s\n"
+ "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z18.s, z18.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "add z19.s, z19.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "add z16.s, z16.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z17.s, z17.s, z1.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ "tbz %x[flags], #5, 14f\n"
+ "and z4.d, z16.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z17.d, z0.d\n"
+ "and z6.d, z18.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z19.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "14:" // Height 1: no shift correction
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "15:" // Height 1: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x12, x12, x19\n"
+ "bgt 3b\n"
+ "b 62f\n"
+ "16:" // Height 2
+ "mov z11.s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x10, %x[col_bias]\n"
+ "mov z12.s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z13.s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.b, #0x1\n"
+ "tbz %x[flags], #2, 17f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "ldr x25, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "add x25, x25, x19\n"
+ "b 18f\n"
+ "17:" // Height 2: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "add x25, x9, x19\n"
+ "18:" // Height 2: Column loop
+ "mov z16.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "whilelt p1.b, x19, x12\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "19:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "20:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 21f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x28, 22f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "b 22f\n"
+ "21:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x24, x26, x19\n"
+ "22:" // Height 2: input setup done
+ "cmp x27, #0x10\n"
+ "ble 25f\n"
+ "23:" // Height 2: Multiply loop: Main loop head
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "sdot z16.s, z4.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z17.s, z5.b, z0.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z20.s, z4.b, z1.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "sdot z21.s, z5.b, z1.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+ "sdot z18.s, z6.b, z0.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+ "sdot z22.s, z6.b, z1.b[0]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+ "sdot z19.s, z7.b, z0.b[0]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "sdot z23.s, z7.b, z1.b[0]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+ "sdot z20.s, z8.b, z1.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+ "sdot z17.s, z9.b, z0.b[1]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+ "sdot z21.s, z9.b, z1.b[1]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[1]\n"
+ "sdot z22.s, z10.b, z1.b[1]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+ "sdot z19.s, z4.b, z0.b[1]\n"
+ "sdot z23.s, z4.b, z1.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+ "sdot z16.s, z5.b, z0.b[2]\n"
+ "sdot z20.s, z5.b, z1.b[2]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+ "sdot z17.s, z6.b, z0.b[2]\n"
+ "sdot z21.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z7.b, z0.b[2]\n"
+ "sdot z22.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z8.b, z0.b[2]\n"
+ "sdot z23.s, z8.b, z1.b[2]\n"
+ "sdot z16.s, z9.b, z0.b[3]\n"
+ "sdot z20.s, z9.b, z1.b[3]\n"
+ "sdot z17.s, z10.b, z0.b[3]\n"
+ "sdot z21.s, z10.b, z1.b[3]\n"
+ "sdot z18.s, z4.b, z0.b[3]\n"
+ "sdot z22.s, z4.b, z1.b[3]\n"
+ "sdot z19.s, z5.b, z0.b[3]\n"
+ "sdot z23.s, z5.b, z1.b[3]\n"
+ "tbnz %x[flags], #31, 24f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z12.s, z1.b, z15.b\n"
+ "24:" // Height 2: Multiply loop: unique 3: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x27, #0x10\n"
+ "bgt 23b\n"
+ "25:" // Height 2: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "sdot z16.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z17.s, z7.b, z0.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z20.s, z6.b, z1.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z21.s, z7.b, z1.b[0]\n"
+ "sdot z18.s, z8.b, z0.b[0]\n"
+ "sdot z22.s, z8.b, z1.b[0]\n"
+ "sdot z19.s, z9.b, z0.b[0]\n"
+ "sdot z23.s, z9.b, z1.b[0]\n"
+ "ble 26f\n"
+ "ld1b { z10.b }, p2/Z, [x11]\n"
+ "sdot z16.s, z10.b, z0.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z20.s, z10.b, z1.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "sdot z17.s, z4.b, z0.b[1]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z21.s, z4.b, z1.b[1]\n"
+ "sdot z18.s, z5.b, z0.b[1]\n"
+ "sdot z22.s, z5.b, z1.b[1]\n"
+ "sdot z19.s, z6.b, z0.b[1]\n"
+ "sdot z23.s, z6.b, z1.b[1]\n"
+ "ble 26f\n"
+ "ld1b { z7.b }, p2/Z, [x11]\n"
+ "sdot z16.s, z7.b, z0.b[2]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z20.s, z7.b, z1.b[2]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "sdot z17.s, z8.b, z0.b[2]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z21.s, z8.b, z1.b[2]\n"
+ "sdot z18.s, z9.b, z0.b[2]\n"
+ "sdot z22.s, z9.b, z1.b[2]\n"
+ "sdot z19.s, z10.b, z0.b[2]\n"
+ "sdot z23.s, z10.b, z1.b[2]\n"
+ "ble 26f\n"
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "sdot z16.s, z4.b, z0.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "sdot z20.s, z4.b, z1.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "sdot z17.s, z5.b, z0.b[3]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z21.s, z5.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z0.b[3]\n"
+ "sdot z22.s, z6.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z0.b[3]\n"
+ "sdot z23.s, z7.b, z1.b[3]\n"
+ "26:" // Height 2: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 27f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z12.s, z1.b, z15.b\n"
+ "27:" // Height 2: Multiply loop: unique 4: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x28, x28, #0x1\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x28, x19\n"
+ "bne 20b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbnz %x[flags], #31, 28f\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1rw { z2.s }, p2/Z, [x19]\n"
+ "neg z2.s, p2/M, z2.s\n"
+ "mov x20, #0x4\n"
+ "mov x19, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
+ "saddv d11, p0, z11.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "saddv d12, p0, z12.s\n"
+ "mov z11.s, z11.s[0]\n"
+ "mov z12.s, z12.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z2.s\n"
+ "mul z12.s, p2/M, z12.s, z2.s\n"
+ "28:" // Height 2: skip row sum fixup
+ "add z16.s, z16.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x10]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z17.s, z17.s, z11.s\n"
+ "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z18.s, z18.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "add z19.s, z19.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "add z20.s, z20.s, z12.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add z21.s, z21.s, z12.s\n"
+ "add z22.s, z22.s, z12.s\n"
+ "add z23.s, z23.s, z12.s\n"
+ "add z16.s, z16.s, z0.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "add z20.s, z20.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z21.s, z21.s, z1.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z23.s, z23.s, z3.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
+ ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ "tbz %x[flags], #5, 29f\n"
+ "and z4.d, z16.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z17.d, z0.d\n"
+ "and z6.d, z18.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z19.d, z0.d\n"
+ "and z8.d, z20.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "and z9.d, z21.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "and z10.d, z22.d, z0.d\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "and z4.d, z23.d, z0.d\n"
+ "asr z9.s, z9.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "sqadd z20.s, z20.s, z8.s\n"
+ "sqadd z21.s, z21.s, z9.s\n"
+ "sqadd z22.s, z22.s, z10.s\n"
+ "sqadd z23.s, z23.s, z4.s\n"
+ "29:" // Height 2: no shift correction
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x9]\n"
+ "add z21.s, z21.s, z4.s\n"
+ "addvl x9, x9, #1\n"
+ ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "uzp1 z21.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z21.b\n"
+ "st1b { z20.b }, p1, [x25]\n"
+ "addvl x25, x25, #1\n"
+ "30:" // Height 2: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x12, x12, x19\n"
+ "bgt 18b\n"
+ "b 62f\n"
+ "31:" // Height 3
+ "mov z11.s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x10, %x[col_bias]\n"
+ "mov z12.s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z13.s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.b, #0x1\n"
+ "tbz %x[flags], #2, 32f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "ldr x25, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "ldr x23, [%x[output_ptr], #0x10]\n"
+ "add x25, x25, x19\n"
+ "add x23, x23, x19\n"
+ "b 33f\n"
+ "32:" // Height 3: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "add x25, x9, x19\n"
+ "add x23, x25, x19\n"
+ "33:" // Height 3: Column loop
+ "mov z16.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "whilelt p1.b, x19, x12\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "34:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "35:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 36f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "cbnz x28, 37f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "b 37f\n"
+ "36:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "37:" // Height 3: input setup done
+ "cmp x27, #0x10\n"
+ "ble 40f\n"
+ "38:" // Height 3: Multiply loop: Main loop head
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "sdot z16.s, z4.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z17.s, z5.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z20.s, z4.b, z1.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z24.s, z4.b, z2.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "sdot z21.s, z5.b, z1.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+ "sdot z25.s, z5.b, z2.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+ "sdot z18.s, z6.b, z0.b[0]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+ "sdot z22.s, z6.b, z1.b[0]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "sdot z26.s, z6.b, z2.b[0]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+ "sdot z19.s, z7.b, z0.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+ "sdot z23.s, z7.b, z1.b[0]\n"
+ "sdot z27.s, z7.b, z2.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[1]\n"
+ "sdot z20.s, z8.b, z1.b[1]\n"
+ "sdot z24.s, z8.b, z2.b[1]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+ "sdot z17.s, z9.b, z0.b[1]\n"
+ "sdot z21.s, z9.b, z1.b[1]\n"
+ "sdot z25.s, z9.b, z2.b[1]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[1]\n"
+ "sdot z22.s, z10.b, z1.b[1]\n"
+ "sdot z26.s, z10.b, z2.b[1]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+ "sdot z19.s, z4.b, z0.b[1]\n"
+ "sdot z23.s, z4.b, z1.b[1]\n"
+ "sdot z27.s, z4.b, z2.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+ "sdot z16.s, z5.b, z0.b[2]\n"
+ "sdot z20.s, z5.b, z1.b[2]\n"
+ "sdot z24.s, z5.b, z2.b[2]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+ "sdot z17.s, z6.b, z0.b[2]\n"
+ "sdot z21.s, z6.b, z1.b[2]\n"
+ "sdot z25.s, z6.b, z2.b[2]\n"
+ "sdot z18.s, z7.b, z0.b[2]\n"
+ "sdot z22.s, z7.b, z1.b[2]\n"
+ "sdot z26.s, z7.b, z2.b[2]\n"
+ "sdot z19.s, z8.b, z0.b[2]\n"
+ "sdot z23.s, z8.b, z1.b[2]\n"
+ "sdot z27.s, z8.b, z2.b[2]\n"
+ "sdot z16.s, z9.b, z0.b[3]\n"
+ "sdot z20.s, z9.b, z1.b[3]\n"
+ "sdot z24.s, z9.b, z2.b[3]\n"
+ "sdot z17.s, z10.b, z0.b[3]\n"
+ "sdot z21.s, z10.b, z1.b[3]\n"
+ "sdot z25.s, z10.b, z2.b[3]\n"
+ "sdot z18.s, z4.b, z0.b[3]\n"
+ "sdot z22.s, z4.b, z1.b[3]\n"
+ "sdot z26.s, z4.b, z2.b[3]\n"
+ "sdot z19.s, z5.b, z0.b[3]\n"
+ "sdot z23.s, z5.b, z1.b[3]\n"
+ "sdot z27.s, z5.b, z2.b[3]\n"
+ "tbnz %x[flags], #31, 39f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z12.s, z1.b, z15.b\n"
+ "sdot z13.s, z2.b, z15.b\n"
+ "39:" // Height 3: Multiply loop: unique 5: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x27, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "bgt 38b\n"
+ "40:" // Height 3: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "sdot z16.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z17.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z20.s, z6.b, z1.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z24.s, z6.b, z2.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z21.s, z7.b, z1.b[0]\n"
+ "sdot z25.s, z7.b, z2.b[0]\n"
+ "sdot z18.s, z8.b, z0.b[0]\n"
+ "sdot z22.s, z8.b, z1.b[0]\n"
+ "sdot z26.s, z8.b, z2.b[0]\n"
+ "sdot z19.s, z9.b, z0.b[0]\n"
+ "sdot z23.s, z9.b, z1.b[0]\n"
+ "sdot z27.s, z9.b, z2.b[0]\n"
+ "ble 41f\n"
+ "ld1b { z10.b }, p2/Z, [x11]\n"
+ "sdot z16.s, z10.b, z0.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z20.s, z10.b, z1.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "sdot z24.s, z10.b, z2.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z17.s, z4.b, z0.b[1]\n"
+ "sdot z21.s, z4.b, z1.b[1]\n"
+ "sdot z25.s, z4.b, z2.b[1]\n"
+ "sdot z18.s, z5.b, z0.b[1]\n"
+ "sdot z22.s, z5.b, z1.b[1]\n"
+ "sdot z26.s, z5.b, z2.b[1]\n"
+ "sdot z19.s, z6.b, z0.b[1]\n"
+ "sdot z23.s, z6.b, z1.b[1]\n"
+ "sdot z27.s, z6.b, z2.b[1]\n"
+ "ble 41f\n"
+ "ld1b { z7.b }, p2/Z, [x11]\n"
+ "sdot z16.s, z7.b, z0.b[2]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z20.s, z7.b, z1.b[2]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "sdot z24.s, z7.b, z2.b[2]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z17.s, z8.b, z0.b[2]\n"
+ "sdot z21.s, z8.b, z1.b[2]\n"
+ "sdot z25.s, z8.b, z2.b[2]\n"
+ "sdot z18.s, z9.b, z0.b[2]\n"
+ "sdot z22.s, z9.b, z1.b[2]\n"
+ "sdot z26.s, z9.b, z2.b[2]\n"
+ "sdot z19.s, z10.b, z0.b[2]\n"
+ "sdot z23.s, z10.b, z1.b[2]\n"
+ "sdot z27.s, z10.b, z2.b[2]\n"
+ "ble 41f\n"
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "sdot z16.s, z4.b, z0.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "sdot z20.s, z4.b, z1.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "sdot z24.s, z4.b, z2.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z17.s, z5.b, z0.b[3]\n"
+ "sdot z21.s, z5.b, z1.b[3]\n"
+ "sdot z25.s, z5.b, z2.b[3]\n"
+ "sdot z18.s, z6.b, z0.b[3]\n"
+ "sdot z22.s, z6.b, z1.b[3]\n"
+ "sdot z26.s, z6.b, z2.b[3]\n"
+ "sdot z19.s, z7.b, z0.b[3]\n"
+ "sdot z23.s, z7.b, z1.b[3]\n"
+ "sdot z27.s, z7.b, z2.b[3]\n"
+ "41:" // Height 3: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 42f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z12.s, z1.b, z15.b\n"
+ "sdot z13.s, z2.b, z15.b\n"
+ "42:" // Height 3: Multiply loop: unique 6: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x28, x28, #0x1\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x28, x19\n"
+ "bne 35b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbnz %x[flags], #31, 43f\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1rw { z3.s }, p2/Z, [x19]\n"
+ "neg z3.s, p2/M, z3.s\n"
+ "mov x20, #0x4\n"
+ "mov x19, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
+ "saddv d11, p0, z11.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "saddv d12, p0, z12.s\n"
+ "mov x19, #0x4\n"
+ "mov z11.s, z11.s[0]\n"
+ "whilelt p0.s, XZR, x19\n"
+ "mov z12.s, z12.s[0]\n"
+ "saddv d13, p0, z13.s\n"
+ "mul z11.s, p2/M, z11.s, z3.s\n"
+ "mul z12.s, p2/M, z12.s, z3.s\n"
+ "mov z13.s, z13.s[0]\n"
+ "mul z13.s, p2/M, z13.s, z3.s\n"
+ "43:" // Height 3: skip row sum fixup
+ "add z16.s, z16.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x10]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z17.s, z17.s, z11.s\n"
+ "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z18.s, z18.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "add z19.s, z19.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "add z20.s, z20.s, z12.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add z21.s, z21.s, z12.s\n"
+ "add z22.s, z22.s, z12.s\n"
+ "add z23.s, z23.s, z12.s\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z25.s, z25.s, z13.s\n"
+ "add z26.s, z26.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
+ "add z16.s, z16.s, z0.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "add z20.s, z20.s, z0.s\n"
+ "add z21.s, z21.s, z1.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z23.s, z23.s, z3.s\n"
+ "add z24.s, z24.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z25.s, z25.s, z1.s\n"
+ "add z26.s, z26.s, z2.s\n"
+ "add z27.s, z27.s, z3.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
+ ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
+ ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
+ ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
+ ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
+ "tbz %x[flags], #5, 44f\n"
+ "and z4.d, z16.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z17.d, z0.d\n"
+ "and z6.d, z18.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z19.d, z0.d\n"
+ "and z8.d, z20.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "and z9.d, z21.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "and z10.d, z22.d, z0.d\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "and z4.d, z23.d, z0.d\n"
+ "asr z9.s, z9.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z24.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "sqadd z20.s, z20.s, z8.s\n"
+ "sqadd z21.s, z21.s, z9.s\n"
+ "sqadd z22.s, z22.s, z10.s\n"
+ "sqadd z23.s, z23.s, z4.s\n"
+ "and z6.d, z25.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z5.s\n"
+ "and z7.d, z26.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "and z8.d, z27.d, z0.d\n"
+ "sqadd z25.s, z25.s, z6.s\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "sqadd z26.s, z26.s, z7.s\n"
+ "sqadd z27.s, z27.s, z8.s\n"
+ "44:" // Height 3: no shift correction
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x9]\n"
+ "add z21.s, z21.s, z4.s\n"
+ "addvl x9, x9, #1\n"
+ ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "add z25.s, z25.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "smin z24.s, p2/M, z24.s, z6.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "smax z24.s, p2/M, z24.s, z5.s\n"
+ "smin z25.s, p2/M, z25.s, z6.s\n"
+ ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
+ "uzp1 z21.h, z22.h, z23.h\n"
+ ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
+ "uzp1 z20.b, z20.b, z21.b\n"
+ "st1b { z20.b }, p1, [x25]\n"
+ "add z26.s, z26.s, z4.s\n"
+ "addvl x25, x25, #1\n"
+ "add z27.s, z27.s, z4.s\n"
+ "smax z25.s, p2/M, z25.s, z5.s\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "smax z26.s, p2/M, z26.s, z5.s\n"
+ "smax z27.s, p2/M, z27.s, z5.s\n"
+ "uzp1 z25.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z25.b\n"
+ "st1b { z24.b }, p1, [x23]\n"
+ "addvl x23, x23, #1\n"
+ "45:" // Height 3: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x12, x12, x19\n"
+ "bgt 33b\n"
+ "b 62f\n"
+ "46:" // Height 4
+ "mov z11.s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x10, %x[col_bias]\n"
+ "mov z12.s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z13.s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.b, #0x1\n"
+ "tbz %x[flags], #2, 47f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "ldr x25, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "ldr x23, [%x[output_ptr], #0x10]\n"
+ "ldr x21, [%x[output_ptr], #0x18]\n"
+ "add x25, x25, x19\n"
+ "add %x[output_ptr], %x[output_ptr], #0x20\n"
+ "add x23, x23, x19\n"
+ "add x21, x21, x19\n"
+ "b 48f\n"
+ "47:" // Height 4: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "add x25, x9, x19\n"
+ "add x23, x25, x19\n"
+ "add x21, x23, x19\n"
+ "add %x[output_ptr], x21, x19\n"
+ "48:" // Height 4: Column loop
+ "mov z16.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "whilelt p1.b, x19, x12\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "49:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "50:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 51f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x20, [x20, #0x18]\n"
+ "cbnz x28, 52f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "add x20, x20, x19\n"
+ "b 52f\n"
+ "51:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "add x20, x22, x19\n"
+ "52:" // Height 4: input setup done
+ "cmp x27, #0x10\n"
+ "ble 55f\n"
+ "53:" // Height 4: Multiply loop: Main loop head
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "sdot z16.s, z4.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z17.s, z5.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z20.s, z4.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z24.s, z4.b, z2.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "sdot z21.s, z5.b, z1.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "sdot z25.s, z5.b, z2.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+ "sdot z28.s, z4.b, z3.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+ "sdot z29.s, z5.b, z3.b[0]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+ "sdot z18.s, z6.b, z0.b[0]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "sdot z22.s, z6.b, z1.b[0]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+ "sdot z26.s, z6.b, z2.b[0]\n"
+ "sdot z30.s, z6.b, z3.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+ "sdot z19.s, z7.b, z0.b[0]\n"
+ "sdot z23.s, z7.b, z1.b[0]\n"
+ "sdot z27.s, z7.b, z2.b[0]\n"
+ "sdot z31.s, z7.b, z3.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+ "sdot z16.s, z8.b, z0.b[1]\n"
+ "sdot z20.s, z8.b, z1.b[1]\n"
+ "sdot z24.s, z8.b, z2.b[1]\n"
+ "sdot z28.s, z8.b, z3.b[1]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+ "sdot z17.s, z9.b, z0.b[1]\n"
+ "sdot z21.s, z9.b, z1.b[1]\n"
+ "sdot z25.s, z9.b, z2.b[1]\n"
+ "sdot z29.s, z9.b, z3.b[1]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+ "sdot z18.s, z10.b, z0.b[1]\n"
+ "sdot z22.s, z10.b, z1.b[1]\n"
+ "sdot z26.s, z10.b, z2.b[1]\n"
+ "sdot z30.s, z10.b, z3.b[1]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+ "sdot z19.s, z4.b, z0.b[1]\n"
+ "sdot z23.s, z4.b, z1.b[1]\n"
+ "sdot z27.s, z4.b, z2.b[1]\n"
+ "sdot z31.s, z4.b, z3.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+ "sdot z16.s, z5.b, z0.b[2]\n"
+ "sdot z20.s, z5.b, z1.b[2]\n"
+ "sdot z24.s, z5.b, z2.b[2]\n"
+ "sdot z28.s, z5.b, z3.b[2]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+ "sdot z17.s, z6.b, z0.b[2]\n"
+ "sdot z21.s, z6.b, z1.b[2]\n"
+ "sdot z25.s, z6.b, z2.b[2]\n"
+ "sdot z29.s, z6.b, z3.b[2]\n"
+ "sdot z18.s, z7.b, z0.b[2]\n"
+ "sdot z22.s, z7.b, z1.b[2]\n"
+ "sdot z26.s, z7.b, z2.b[2]\n"
+ "sdot z30.s, z7.b, z3.b[2]\n"
+ "sdot z19.s, z8.b, z0.b[2]\n"
+ "sdot z23.s, z8.b, z1.b[2]\n"
+ "sdot z27.s, z8.b, z2.b[2]\n"
+ "sdot z31.s, z8.b, z3.b[2]\n"
+ "sdot z16.s, z9.b, z0.b[3]\n"
+ "sdot z20.s, z9.b, z1.b[3]\n"
+ "sdot z24.s, z9.b, z2.b[3]\n"
+ "sdot z28.s, z9.b, z3.b[3]\n"
+ "sdot z17.s, z10.b, z0.b[3]\n"
+ "sdot z21.s, z10.b, z1.b[3]\n"
+ "sdot z25.s, z10.b, z2.b[3]\n"
+ "sdot z29.s, z10.b, z3.b[3]\n"
+ "sdot z18.s, z4.b, z0.b[3]\n"
+ "sdot z22.s, z4.b, z1.b[3]\n"
+ "sdot z26.s, z4.b, z2.b[3]\n"
+ "sdot z30.s, z4.b, z3.b[3]\n"
+ "sdot z19.s, z5.b, z0.b[3]\n"
+ "sdot z23.s, z5.b, z1.b[3]\n"
+ "sdot z27.s, z5.b, z2.b[3]\n"
+ "sdot z31.s, z5.b, z3.b[3]\n"
+ "tbnz %x[flags], #31, 54f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z12.s, z1.b, z15.b\n"
+ "sdot z13.s, z2.b, z15.b\n"
+ "sdot z14.s, z3.b, z15.b\n"
+ "54:" // Height 4: Multiply loop: unique 7: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x27, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "bgt 53b\n"
+ "55:" // Height 4: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "sdot z16.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z17.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z20.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z24.s, z6.b, z2.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "sdot z21.s, z7.b, z1.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z28.s, z6.b, z3.b[0]\n"
+ "sdot z25.s, z7.b, z2.b[0]\n"
+ "sdot z29.s, z7.b, z3.b[0]\n"
+ "sdot z18.s, z8.b, z0.b[0]\n"
+ "sdot z22.s, z8.b, z1.b[0]\n"
+ "sdot z26.s, z8.b, z2.b[0]\n"
+ "sdot z30.s, z8.b, z3.b[0]\n"
+ "sdot z19.s, z9.b, z0.b[0]\n"
+ "sdot z23.s, z9.b, z1.b[0]\n"
+ "sdot z27.s, z9.b, z2.b[0]\n"
+ "sdot z31.s, z9.b, z3.b[0]\n"
+ "ble 56f\n"
+ "ld1b { z10.b }, p2/Z, [x11]\n"
+ "sdot z16.s, z10.b, z0.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z20.s, z10.b, z1.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "sdot z24.s, z10.b, z2.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z28.s, z10.b, z3.b[1]\n"
+ "sdot z17.s, z4.b, z0.b[1]\n"
+ "sdot z21.s, z4.b, z1.b[1]\n"
+ "sdot z25.s, z4.b, z2.b[1]\n"
+ "sdot z29.s, z4.b, z3.b[1]\n"
+ "sdot z18.s, z5.b, z0.b[1]\n"
+ "sdot z22.s, z5.b, z1.b[1]\n"
+ "sdot z26.s, z5.b, z2.b[1]\n"
+ "sdot z30.s, z5.b, z3.b[1]\n"
+ "sdot z19.s, z6.b, z0.b[1]\n"
+ "sdot z23.s, z6.b, z1.b[1]\n"
+ "sdot z27.s, z6.b, z2.b[1]\n"
+ "sdot z31.s, z6.b, z3.b[1]\n"
+ "ble 56f\n"
+ "ld1b { z7.b }, p2/Z, [x11]\n"
+ "sdot z16.s, z7.b, z0.b[2]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "sdot z20.s, z7.b, z1.b[2]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "sdot z24.s, z7.b, z2.b[2]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z28.s, z7.b, z3.b[2]\n"
+ "sdot z17.s, z8.b, z0.b[2]\n"
+ "sdot z21.s, z8.b, z1.b[2]\n"
+ "sdot z25.s, z8.b, z2.b[2]\n"
+ "sdot z29.s, z8.b, z3.b[2]\n"
+ "sdot z18.s, z9.b, z0.b[2]\n"
+ "sdot z22.s, z9.b, z1.b[2]\n"
+ "sdot z26.s, z9.b, z2.b[2]\n"
+ "sdot z30.s, z9.b, z3.b[2]\n"
+ "sdot z19.s, z10.b, z0.b[2]\n"
+ "sdot z23.s, z10.b, z1.b[2]\n"
+ "sdot z27.s, z10.b, z2.b[2]\n"
+ "sdot z31.s, z10.b, z3.b[2]\n"
+ "ble 56f\n"
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "sdot z16.s, z4.b, z0.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "sdot z20.s, z4.b, z1.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "sdot z24.s, z4.b, z2.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "sdot z28.s, z4.b, z3.b[3]\n"
+ "sdot z17.s, z5.b, z0.b[3]\n"
+ "sdot z21.s, z5.b, z1.b[3]\n"
+ "sdot z25.s, z5.b, z2.b[3]\n"
+ "sdot z29.s, z5.b, z3.b[3]\n"
+ "sdot z18.s, z6.b, z0.b[3]\n"
+ "sdot z22.s, z6.b, z1.b[3]\n"
+ "sdot z26.s, z6.b, z2.b[3]\n"
+ "sdot z30.s, z6.b, z3.b[3]\n"
+ "sdot z19.s, z7.b, z0.b[3]\n"
+ "sdot z23.s, z7.b, z1.b[3]\n"
+ "sdot z27.s, z7.b, z2.b[3]\n"
+ "sdot z31.s, z7.b, z3.b[3]\n"
+ "56:" // Height 4: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 57f\n"
+ "sdot z11.s, z0.b, z15.b\n"
+ "sdot z12.s, z1.b, z15.b\n"
+ "sdot z13.s, z2.b, z15.b\n"
+ "sdot z14.s, z3.b, z15.b\n"
+ "57:" // Height 4: Multiply loop: unique 8: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x28, x28, #0x1\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x28, x19\n"
+ "bne 50b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbnz %x[flags], #31, 58f\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "neg z4.s, p2/M, z4.s\n"
+ "mov x20, #0x4\n"
+ "mov x19, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
+ "saddv d11, p0, z11.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "saddv d12, p0, z12.s\n"
+ "mov x19, #0x4\n"
+ "mov z11.s, z11.s[0]\n"
+ "whilelt p0.s, XZR, x19\n"
+ "mov x19, #0x4\n"
+ "mov z12.s, z12.s[0]\n"
+ "saddv d13, p0, z13.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "mul z11.s, p2/M, z11.s, z4.s\n"
+ "saddv d14, p0, z14.s\n"
+ "mul z12.s, p2/M, z12.s, z4.s\n"
+ "mov z13.s, z13.s[0]\n"
+ "mul z13.s, p2/M, z13.s, z4.s\n"
+ "mov z14.s, z14.s[0]\n"
+ "mul z14.s, p2/M, z14.s, z4.s\n"
+ "58:" // Height 4: skip row sum fixup
+ "add z16.s, z16.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x10]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z17.s, z17.s, z11.s\n"
+ "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z18.s, z18.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "add z19.s, z19.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "add z20.s, z20.s, z12.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add z21.s, z21.s, z12.s\n"
+ "add z22.s, z22.s, z12.s\n"
+ "add z23.s, z23.s, z12.s\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z25.s, z25.s, z13.s\n"
+ "add z26.s, z26.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
+ "add z28.s, z28.s, z14.s\n"
+ "add z29.s, z29.s, z14.s\n"
+ "add z30.s, z30.s, z14.s\n"
+ "add z31.s, z31.s, z14.s\n"
+ "add z16.s, z16.s, z0.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "add z20.s, z20.s, z0.s\n"
+ "add z21.s, z21.s, z1.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z23.s, z23.s, z3.s\n"
+ "add z24.s, z24.s, z0.s\n"
+ "add z25.s, z25.s, z1.s\n"
+ "add z26.s, z26.s, z2.s\n"
+ "add z27.s, z27.s, z3.s\n"
+ "add z28.s, z28.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z29.s, z29.s, z1.s\n"
+ "add z30.s, z30.s, z2.s\n"
+ "add z31.s, z31.s, z3.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
+ ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
+ ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
+ ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
+ ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
+ ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n"
+ ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n"
+ ".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n"
+ ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
+ "tbz %x[flags], #5, 59f\n"
+ "and z4.d, z16.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z17.d, z0.d\n"
+ "and z6.d, z18.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z19.d, z0.d\n"
+ "and z8.d, z20.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "and z9.d, z21.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "and z10.d, z22.d, z0.d\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "and z4.d, z23.d, z0.d\n"
+ "asr z9.s, z9.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z24.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "sqadd z20.s, z20.s, z8.s\n"
+ "sqadd z21.s, z21.s, z9.s\n"
+ "sqadd z22.s, z22.s, z10.s\n"
+ "sqadd z23.s, z23.s, z4.s\n"
+ "and z6.d, z25.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z5.s\n"
+ "and z7.d, z26.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "and z8.d, z27.d, z0.d\n"
+ "and z9.d, z28.d, z0.d\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "sqadd z25.s, z25.s, z6.s\n"
+ "and z10.d, z29.d, z0.d\n"
+ "asr z9.s, z9.s, #0x1f\n"
+ "and z4.d, z30.d, z0.d\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "sqadd z26.s, z26.s, z7.s\n"
+ "and z5.d, z31.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z27.s, z27.s, z8.s\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z9.s\n"
+ "sqadd z29.s, z29.s, z10.s\n"
+ "sqadd z30.s, z30.s, z4.s\n"
+ "sqadd z31.s, z31.s, z5.s\n"
+ "59:" // Height 4: no shift correction
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x9]\n"
+ "add z21.s, z21.s, z4.s\n"
+ "addvl x9, x9, #1\n"
+ ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "add z25.s, z25.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "smin z24.s, p2/M, z24.s, z6.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "smax z24.s, p2/M, z24.s, z5.s\n"
+ "smin z25.s, p2/M, z25.s, z6.s\n"
+ ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
+ "uzp1 z21.h, z22.h, z23.h\n"
+ ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
+ "uzp1 z20.b, z20.b, z21.b\n"
+ "st1b { z20.b }, p1, [x25]\n"
+ "add z26.s, z26.s, z4.s\n"
+ "addvl x25, x25, #1\n"
+ "add z27.s, z27.s, z4.s\n"
+ "smax z25.s, p2/M, z25.s, z5.s\n"
+ ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "add z28.s, z28.s, z4.s\n"
+ "smax z26.s, p2/M, z26.s, z5.s\n"
+ "smax z27.s, p2/M, z27.s, z5.s\n"
+ "smin z28.s, p2/M, z28.s, z6.s\n"
+ ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n"
+ ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n"
+ "uzp1 z25.h, z26.h, z27.h\n"
+ "smax z28.s, p2/M, z28.s, z5.s\n"
+ "add z29.s, z29.s, z4.s\n"
+ "add z30.s, z30.s, z4.s\n"
+ "uzp1 z24.b, z24.b, z25.b\n"
+ "st1b { z24.b }, p1, [x23]\n"
+ "smin z29.s, p2/M, z29.s, z6.s\n"
+ "addvl x23, x23, #1\n"
+ "smin z30.s, p2/M, z30.s, z6.s\n"
+ ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
+ "smax z29.s, p2/M, z29.s, z5.s\n"
+ "add z31.s, z31.s, z4.s\n"
+ "smax z30.s, p2/M, z30.s, z5.s\n"
+ "uzp1 z28.h, z28.h, z29.h\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smax z31.s, p2/M, z31.s, z5.s\n"
+ "uzp1 z29.h, z30.h, z31.h\n"
+ "uzp1 z28.b, z28.b, z29.b\n"
+ "st1b { z28.b }, p1, [x21]\n"
+ "addvl x21, x21, #1\n"
+ "60:" // Height 4: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x12, x12, x19\n"
+ "bgt 48b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 62f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 61f\n"
+ "add x20, x20, #0x4\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "61:" // Update direct input
+ "mov x19, #0x4\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "62:" // Exit
+
+ : [M] "+r" (M), [flags] "+r" (flags), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp
new file mode 100644
index 0000000000..d8562898aa
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __ARM_FEATURE_SVE
+
+#include "../std_transforms_sve.hpp"
+
+#define ARGLIST /* Parameter-type list shared by the kernel declaration and kern_type; names below follow the definition in generic.cpp. #undef'd at end of file. */ \
+ unsigned int, const unsigned int *, /* num_strings, string_lengths */ \
+ IndirectInputArg<int8_t>, /* A_arg */ \
+ size_t, size_t, /* M, N */ \
+ const int8_t *, /* B_ptr */ \
+ IndirectOutputArg<int8_t>, /* output_arg */ \
+ const Requantize32 *, const int32_t *, unsigned int /* qp, col_bias, col_base */
+
+namespace arm_gemm
+{
+
+// Actual kernel implementation (defined in sve_hybrid_s8qs_dot_6x4VL/generic.cpp); ARGLIST expands to its parameter types.
+void sve_hybrid_s8qs_dot_6x4VL( ARGLIST );
+
+class cls_sve_hybrid_s8qs_dot_6x4VL // Descriptor bundling the element types, blocking parameters and kernel pointer for sve_hybrid_s8qs_dot_6x4VL.
+{
+public:
+ typedef int8_t operand_type; // Input element type (matches the int8_t input/B arguments in ARGLIST).
+ typedef int8_t result_type; // Output element type (matches IndirectOutputArg<int8_t> in ARGLIST).
+
+ typedef void (*kern_type)( ARGLIST ); // Pointer-to-function type with the kernel's signature.
+
+ /* Kernel blocking parameters: output tile height/width and K granularity reported to the caller. */
+ static constexpr unsigned int out_height() // Output rows per kernel pass; generic.cpp's row loop dispatches on M up to 6.
+ {
+ return 6;
+ }
+
+ static unsigned int out_width() // Output columns per tile; not constexpr because get_vector_length() is evaluated at run time.
+ {
+ return get_vector_length<int32_t>() * 4; // Four vectors' worth of int32 elements; presumably the SVE lane count - confirm in utils.hpp.
+ }
+
+ static constexpr unsigned int k_unroll() // K granularity; presumably the 4 int8 values each sdot accumulator lane consumes - TODO confirm.
+ {
+ return 4;
+ }
+
+ static constexpr bool supports_accumulate() // Always false. NOTE(review): presumably the kernel cannot add into existing output - confirm with the caller.
+ {
+ return false;
+ }
+
+ StdTransformsSVE<operand_type, result_type, 6, 4, 4> transforms = {}; // NOTE(review): 6, 4, 4 appear to mirror out_height, width in vectors and k_unroll - confirm against std_transforms_sve.hpp.
+
+ // Default to the generic kernel; nothing in this class reassigns it.
+ kern_type kernel=sve_hybrid_s8qs_dot_6x4VL;
+
+ cls_sve_hybrid_s8qs_dot_6x4VL(const CPUInfo *) // CPUInfo argument is unnamed and unused: no per-CPU kernel selection happens here.
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
new file mode 100644
index 0000000000..4a4af6356c
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8qs_dot_6x4VL/generic.cpp
@@ -0,0 +1,2770 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sve_hybrid_s8qs_dot_6x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int col_base
+)
+{
+ struct KernelArgs {
+ const int32_t *multiplier_ptr = {};
+ const int32_t *shift_ptr = {};
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->per_channel_requant) {
+ flags |= 0x10;
+ ka.multiplier_ptr=qp->per_channel_muls + col_base;
+ ka.shift_ptr=qp->per_channel_right_shifts + col_base;
+ }
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 71f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 57f\n"
+ "beq 43f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 29f\n"
+ "beq 15f\n"
+ "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "mov x16, %x[col_bias]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "mov z8.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "whilelt p1.b, x19, x15\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "4:" // Height 1: setup done
+ "mov x12, #0x0\n"
+ "5:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 6f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "cbnz x12, 7f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "b 7f\n"
+ "6:" // Height 1: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "7:" // Height 1: input setup done
+ "cmp x11, #0x10\n"
+ "ble 9f\n"
+ "8:" // Height 1: Multiply loop: Main loop head
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "cmp x11, #0x10\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "bgt 8b\n"
+ "9:" // Height 1: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "ble 10f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "ble 10f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "ble 10f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "10:" // Height 1: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 5b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "ld1w { z0.s }, p2/Z, [x16]\n"
+ "add z8.s, z8.s, z0.s\n"
+ "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n"
+ "add z9.s, z9.s, z1.s\n"
+ "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n"
+ "addvl x16, x16, #4\n"
+ "add z10.s, z10.s, z2.s\n"
+ "add z11.s, z11.s, z3.s\n"
+ "tbz %x[flags], #4, 11f\n"
+ "ld1w { z0.s }, p2/Z, [x17]\n"
+ "ld1w { z4.s }, p2/Z, [x8]\n"
+ "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n"
+ "addvl x17, x17, #4\n"
+ "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n"
+ "addvl x8, x8, #4\n"
+ "b 12f\n"
+ "11:" // Height 1: per layer parameters
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x19]\n"
+ "mov z1.d, z0.d\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "mov z2.d, z0.d\n"
+ "mov z3.d, z0.d\n"
+ "mov z5.d, z4.d\n"
+ "mov z6.d, z4.d\n"
+ "mov z7.d, z4.d\n"
+ "12:" // Height 1: parameters loaded
+ ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
+ ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
+ ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
+ ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
+ "tbz %x[flags], #5, 13f\n"
+ "and z4.d, z8.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z9.d, z1.d\n"
+ "and z6.d, z10.d, z2.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z11.d, z3.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z4.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z9.s, z9.s, z5.s\n"
+ "sqadd z10.s, z10.s, z6.s\n"
+ "sqadd z11.s, z11.s, z7.s\n"
+ "13:" // Height 1: no shift correction
+ ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "add z8.s, z8.s, z4.s\n"
+ "add z9.s, z9.s, z4.s\n"
+ "add z10.s, z10.s, z4.s\n"
+ "add z11.s, z11.s, z4.s\n"
+ "smin z8.s, p2/M, z8.s, z6.s\n"
+ "smin z9.s, p2/M, z9.s, z6.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smin z11.s, p2/M, z11.s, z6.s\n"
+ "smax z8.s, p2/M, z8.s, z5.s\n"
+ "smax z9.s, p2/M, z9.s, z5.s\n"
+ "smax z10.s, p2/M, z10.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z5.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ "uzp1 z9.h, z10.h, z11.h\n"
+ "uzp1 z8.b, z8.b, z9.b\n"
+ "st1b { z8.b }, p1, [x13]\n"
+ "addvl x13, x13, #1\n"
+ "14:" // Height 1: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 3b\n"
+ "b 86f\n"
+ "15:" // Height 2
+ "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x16, %x[col_bias]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 16f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "b 17f\n"
+ "16:" // Height 2: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19\n"
+ "17:" // Height 2: Column loop
+ "mov z8.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "whilelt p1.b, x19, x15\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "18:" // Height 2: setup done
+ "mov x12, #0x0\n"
+ "19:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 20f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x12, 21f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "b 21f\n"
+ "20:" // Height 2: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "21:" // Height 2: input setup done
+ "cmp x11, #0x10\n"
+ "ble 23f\n"
+ "22:" // Height 2: Multiply loop: Main loop head
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "cmp x11, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "bgt 22b\n"
+ "23:" // Height 2: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "ble 24f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "ble 24f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "ble 24f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "24:" // Height 2: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 19b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "ld1w { z0.s }, p2/Z, [x16]\n"
+ "add z8.s, z8.s, z0.s\n"
+ "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n"
+ "add z12.s, z12.s, z0.s\n"
+ "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n"
+ "add z9.s, z9.s, z1.s\n"
+ "addvl x16, x16, #4\n"
+ "add z13.s, z13.s, z1.s\n"
+ "add z10.s, z10.s, z2.s\n"
+ "add z11.s, z11.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
+ "add z15.s, z15.s, z3.s\n"
+ "tbz %x[flags], #4, 25f\n"
+ "ld1w { z0.s }, p2/Z, [x17]\n"
+ "ld1w { z4.s }, p2/Z, [x8]\n"
+ "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n"
+ "addvl x17, x17, #4\n"
+ "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n"
+ "addvl x8, x8, #4\n"
+ "b 26f\n"
+ "25:" // Height 2: per layer parameters
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x19]\n"
+ "mov z1.d, z0.d\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "mov z2.d, z0.d\n"
+ "mov z3.d, z0.d\n"
+ "mov z5.d, z4.d\n"
+ "mov z6.d, z4.d\n"
+ "mov z7.d, z4.d\n"
+ "26:" // Height 2: parameters loaded
+ ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
+ ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
+ ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
+ ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
+ ".inst 0x04a4758c // sqrdmulh z12.s, z12.s, z4.s\n"
+ ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n"
+ ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n"
+ ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n"
+ "tbz %x[flags], #5, 27f\n"
+ "and z4.d, z8.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z9.d, z1.d\n"
+ "and z6.d, z10.d, z2.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z11.d, z3.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z4.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "and z4.d, z12.d, z0.d\n"
+ "sqadd z9.s, z9.s, z5.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z10.s, z10.s, z6.s\n"
+ "and z5.d, z13.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z11.s, z11.s, z7.s\n"
+ "and z6.d, z14.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z12.s, z12.s, z4.s\n"
+ "and z7.d, z15.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z13.s, z13.s, z5.s\n"
+ "sqadd z14.s, z14.s, z6.s\n"
+ "sqadd z15.s, z15.s, z7.s\n"
+ "27:" // Height 2: no shift correction
+ ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
+ "add z8.s, z8.s, z4.s\n"
+ "add z9.s, z9.s, z4.s\n"
+ "add z10.s, z10.s, z4.s\n"
+ "add z11.s, z11.s, z4.s\n"
+ "add z12.s, z12.s, z4.s\n"
+ "smin z8.s, p2/M, z8.s, z6.s\n"
+ "smin z9.s, p2/M, z9.s, z6.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smin z11.s, p2/M, z11.s, z6.s\n"
+ "smax z8.s, p2/M, z8.s, z5.s\n"
+ "smax z9.s, p2/M, z9.s, z5.s\n"
+ "smax z10.s, p2/M, z10.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z5.s\n"
+ "smin z12.s, p2/M, z12.s, z6.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n"
+ "uzp1 z9.h, z10.h, z11.h\n"
+ "smax z12.s, p2/M, z12.s, z5.s\n"
+ "uzp1 z8.b, z8.b, z9.b\n"
+ "st1b { z8.b }, p1, [x13]\n"
+ "add z13.s, z13.s, z4.s\n"
+ "addvl x13, x13, #1\n"
+ ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
+ ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
+ "smin z13.s, p2/M, z13.s, z6.s\n"
+ "add z14.s, z14.s, z4.s\n"
+ "add z15.s, z15.s, z4.s\n"
+ "smax z13.s, p2/M, z13.s, z5.s\n"
+ "smin z14.s, p2/M, z14.s, z6.s\n"
+ "smin z15.s, p2/M, z15.s, z6.s\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "smax z14.s, p2/M, z14.s, z5.s\n"
+ "smax z15.s, p2/M, z15.s, z5.s\n"
+ "uzp1 z13.h, z14.h, z15.h\n"
+ "uzp1 z12.b, z12.b, z13.b\n"
+ "st1b { z12.b }, p1, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "28:" // Height 2: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 17b\n"
+ "b 86f\n"
+ "29:" // Height 3
+ "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x16, %x[col_bias]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 30f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19\n"
+ "add x27, x27, x19\n"
+ "b 31f\n"
+ "30:" // Height 3: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19\n"
+ "add x27, x9, x19\n"
+ "31:" // Height 3: Column loop
+ "mov z8.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "whilelt p1.b, x19, x15\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "32:" // Height 3: setup done
+ "mov x12, #0x0\n"
+ "33:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 34f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x12, 35f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "b 35f\n"
+ "34:" // Height 3: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "35:" // Height 3: input setup done
+ "cmp x11, #0x10\n"
+ "ble 37f\n"
+ "36:" // Height 3: Multiply loop: Main loop head
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "cmp x11, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "bgt 36b\n"
+ "37:" // Height 3: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "ble 38f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "ble 38f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "ble 38f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "38:" // Height 3: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 33b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "ld1w { z0.s }, p2/Z, [x16]\n"
+ "add z8.s, z8.s, z0.s\n"
+ "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n"
+ "add z12.s, z12.s, z0.s\n"
+ "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n"
+ "add z16.s, z16.s, z0.s\n"
+ "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n"
+ "addvl x16, x16, #4\n"
+ "add z9.s, z9.s, z1.s\n"
+ "add z13.s, z13.s, z1.s\n"
+ "add z10.s, z10.s, z2.s\n"
+ "add z11.s, z11.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
+ "add z15.s, z15.s, z3.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "tbz %x[flags], #4, 39f\n"
+ "ld1w { z0.s }, p2/Z, [x17]\n"
+ "ld1w { z4.s }, p2/Z, [x8]\n"
+ "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n"
+ "addvl x17, x17, #4\n"
+ "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n"
+ "addvl x8, x8, #4\n"
+ "b 40f\n"
+ "39:" // Height 3: per layer parameters
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x19]\n"
+ "mov z1.d, z0.d\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "mov z2.d, z0.d\n"
+ "mov z3.d, z0.d\n"
+ "mov z5.d, z4.d\n"
+ "mov z6.d, z4.d\n"
+ "mov z7.d, z4.d\n"
+ "40:" // Height 3: parameters loaded
+ ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
+ ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
+ ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
+ ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
+ ".inst 0x04a4758c // sqrdmulh z12.s, z12.s, z4.s\n"
+ ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n"
+ ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n"
+ ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n"
+ ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n"
+ ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n"
+ "tbz %x[flags], #5, 41f\n"
+ "and z4.d, z8.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z9.d, z1.d\n"
+ "and z6.d, z10.d, z2.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z11.d, z3.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z4.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "and z4.d, z12.d, z0.d\n"
+ "sqadd z9.s, z9.s, z5.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z10.s, z10.s, z6.s\n"
+ "and z5.d, z13.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z11.s, z11.s, z7.s\n"
+ "and z6.d, z14.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z12.s, z12.s, z4.s\n"
+ "and z7.d, z15.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z13.s, z13.s, z5.s\n"
+ "and z4.d, z16.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z14.s, z14.s, z6.s\n"
+ "and z5.d, z17.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z15.s, z15.s, z7.s\n"
+ "and z6.d, z18.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "and z7.d, z19.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "41:" // Height 3: no shift correction
+ ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
+ "add z8.s, z8.s, z4.s\n"
+ "add z9.s, z9.s, z4.s\n"
+ "add z10.s, z10.s, z4.s\n"
+ "add z11.s, z11.s, z4.s\n"
+ "add z12.s, z12.s, z4.s\n"
+ "smin z8.s, p2/M, z8.s, z6.s\n"
+ "smin z9.s, p2/M, z9.s, z6.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smin z11.s, p2/M, z11.s, z6.s\n"
+ "smax z8.s, p2/M, z8.s, z5.s\n"
+ "smax z9.s, p2/M, z9.s, z5.s\n"
+ "smax z10.s, p2/M, z10.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z5.s\n"
+ "smin z12.s, p2/M, z12.s, z6.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n"
+ "uzp1 z9.h, z10.h, z11.h\n"
+ "smax z12.s, p2/M, z12.s, z5.s\n"
+ "uzp1 z8.b, z8.b, z9.b\n"
+ "st1b { z8.b }, p1, [x13]\n"
+ "add z13.s, z13.s, z4.s\n"
+ "addvl x13, x13, #1\n"
+ ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
+ ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "smin z13.s, p2/M, z13.s, z6.s\n"
+ ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
+ "add z14.s, z14.s, z4.s\n"
+ "add z15.s, z15.s, z4.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "smax z13.s, p2/M, z13.s, z5.s\n"
+ "smin z14.s, p2/M, z14.s, z6.s\n"
+ "smin z15.s, p2/M, z15.s, z6.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "smax z14.s, p2/M, z14.s, z5.s\n"
+ "smax z15.s, p2/M, z15.s, z5.s\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
+ "uzp1 z13.h, z14.h, z15.h\n"
+ ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
+ "uzp1 z12.b, z12.b, z13.b\n"
+ "st1b { z12.b }, p1, [x9]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "addvl x9, x9, #1\n"
+ "add z19.s, z19.s, z4.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x27]\n"
+ "addvl x27, x27, #1\n"
+ "42:" // Height 3: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 31b\n"
+ "b 86f\n"
+ "43:" // Height 4
+ "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x16, %x[col_bias]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 44f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "add x27, x27, x19\n"
+ "add x25, x25, x19\n"
+ "b 45f\n"
+ "44:" // Height 4: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19\n"
+ "add x27, x9, x19\n"
+ "add x25, x27, x19\n"
+ "45:" // Height 4: Column loop
+ "mov z8.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "whilelt p1.b, x19, x15\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "46:" // Height 4: setup done
+ "mov x12, #0x0\n"
+ "47:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 48f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x12, 49f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "b 49f\n"
+ "48:" // Height 4: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "49:" // Height 4: input setup done
+ "cmp x11, #0x10\n"
+ "ble 51f\n"
+ "50:" // Height 4: Multiply loop: Main loop head
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x10\n"
+ "sdot z20.s, z6.b, z3.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sdot z21.s, z7.b, z3.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z22.s, z6.b, z3.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z23.s, z7.b, z3.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "sdot z20.s, z6.b, z3.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "sdot z21.s, z7.b, z3.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z22.s, z6.b, z3.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z23.s, z7.b, z3.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "sdot z20.s, z6.b, z3.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "sdot z21.s, z7.b, z3.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z22.s, z6.b, z3.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z23.s, z7.b, z3.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "sdot z20.s, z6.b, z3.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "sdot z21.s, z7.b, z3.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z22.s, z6.b, z3.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z23.s, z7.b, z3.b[3]\n"
+ "bgt 50b\n"
+ "51:" // Height 4: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "sdot z20.s, z6.b, z3.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z21.s, z7.b, z3.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z22.s, z6.b, z3.b[0]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z23.s, z7.b, z3.b[0]\n"
+ "ble 52f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "sdot z20.s, z6.b, z3.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "sdot z21.s, z7.b, z3.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z22.s, z6.b, z3.b[1]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z23.s, z7.b, z3.b[1]\n"
+ "ble 52f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "sdot z20.s, z6.b, z3.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "sdot z21.s, z7.b, z3.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z22.s, z6.b, z3.b[2]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z23.s, z7.b, z3.b[2]\n"
+ "ble 52f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "sdot z20.s, z6.b, z3.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "sdot z21.s, z7.b, z3.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z22.s, z6.b, z3.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z23.s, z7.b, z3.b[3]\n"
+ "52:" // Height 4: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 47b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "ld1w { z0.s }, p2/Z, [x16]\n"
+ "add z8.s, z8.s, z0.s\n"
+ "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n"
+ "add z12.s, z12.s, z0.s\n"
+ "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n"
+ "add z16.s, z16.s, z0.s\n"
+ "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n"
+ "addvl x16, x16, #4\n"
+ "add z9.s, z9.s, z1.s\n"
+ "add z13.s, z13.s, z1.s\n"
+ "add z10.s, z10.s, z2.s\n"
+ "add z11.s, z11.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
+ "add z15.s, z15.s, z3.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "add z20.s, z20.s, z0.s\n"
+ "add z21.s, z21.s, z1.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z23.s, z23.s, z3.s\n"
+ "tbz %x[flags], #4, 53f\n"
+ "ld1w { z0.s }, p2/Z, [x17]\n"
+ "ld1w { z4.s }, p2/Z, [x8]\n"
+ "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n"
+ "addvl x17, x17, #4\n"
+ "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n"
+ "addvl x8, x8, #4\n"
+ "b 54f\n"
+ "53:" // Height 4: per layer parameters
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x19]\n"
+ "mov z1.d, z0.d\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "mov z2.d, z0.d\n"
+ "mov z3.d, z0.d\n"
+ "mov z5.d, z4.d\n"
+ "mov z6.d, z4.d\n"
+ "mov z7.d, z4.d\n"
+ "54:" // Height 4: parameters loaded
+ ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
+ ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
+ ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
+ ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
+ ".inst 0x04a4758c // sqrdmulh z12.s, z12.s, z4.s\n"
+ ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n"
+ ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n"
+ ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n"
+ ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n"
+ ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x04a576b5 // sqrdmulh z21.s, z21.s, z5.s\n"
+ ".inst 0x04a676d6 // sqrdmulh z22.s, z22.s, z6.s\n"
+ ".inst 0x04a776f7 // sqrdmulh z23.s, z23.s, z7.s\n"
+ "tbz %x[flags], #5, 55f\n"
+ "and z4.d, z8.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z9.d, z1.d\n"
+ "and z6.d, z10.d, z2.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z11.d, z3.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z4.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "and z4.d, z12.d, z0.d\n"
+ "sqadd z9.s, z9.s, z5.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z10.s, z10.s, z6.s\n"
+ "and z5.d, z13.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z11.s, z11.s, z7.s\n"
+ "and z6.d, z14.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z12.s, z12.s, z4.s\n"
+ "and z7.d, z15.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z13.s, z13.s, z5.s\n"
+ "and z4.d, z16.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z14.s, z14.s, z6.s\n"
+ "and z5.d, z17.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z15.s, z15.s, z7.s\n"
+ "and z6.d, z18.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "and z7.d, z19.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "and z4.d, z20.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "and z5.d, z21.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "and z6.d, z22.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z20.s, z20.s, z4.s\n"
+ "and z7.d, z23.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z21.s, z21.s, z5.s\n"
+ "sqadd z22.s, z22.s, z6.s\n"
+ "sqadd z23.s, z23.s, z7.s\n"
+ "55:" // Height 4: no shift correction
+ ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
+ "add z8.s, z8.s, z4.s\n"
+ "add z9.s, z9.s, z4.s\n"
+ "add z10.s, z10.s, z4.s\n"
+ "add z11.s, z11.s, z4.s\n"
+ "add z12.s, z12.s, z4.s\n"
+ "smin z8.s, p2/M, z8.s, z6.s\n"
+ "smin z9.s, p2/M, z9.s, z6.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smin z11.s, p2/M, z11.s, z6.s\n"
+ "smax z8.s, p2/M, z8.s, z5.s\n"
+ "smax z9.s, p2/M, z9.s, z5.s\n"
+ "smax z10.s, p2/M, z10.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z5.s\n"
+ "smin z12.s, p2/M, z12.s, z6.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n"
+ "uzp1 z9.h, z10.h, z11.h\n"
+ "smax z12.s, p2/M, z12.s, z5.s\n"
+ "uzp1 z8.b, z8.b, z9.b\n"
+ "st1b { z8.b }, p1, [x13]\n"
+ "add z13.s, z13.s, z4.s\n"
+ "addvl x13, x13, #1\n"
+ ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
+ ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "smin z13.s, p2/M, z13.s, z6.s\n"
+ ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
+ "add z14.s, z14.s, z4.s\n"
+ "add z15.s, z15.s, z4.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "smax z13.s, p2/M, z13.s, z5.s\n"
+ "smin z14.s, p2/M, z14.s, z6.s\n"
+ "smin z15.s, p2/M, z15.s, z6.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "smax z14.s, p2/M, z14.s, z5.s\n"
+ "smax z15.s, p2/M, z15.s, z5.s\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
+ "uzp1 z13.h, z14.h, z15.h\n"
+ ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
+ "uzp1 z12.b, z12.b, z13.b\n"
+ "st1b { z12.b }, p1, [x9]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "addvl x9, x9, #1\n"
+ "add z19.s, z19.s, z4.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "add z20.s, z20.s, z4.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n"
+ ".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "add z21.s, z21.s, z4.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x27]\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "addvl x27, x27, #1\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ ".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "uzp1 z21.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z21.b\n"
+ "st1b { z20.b }, p1, [x25]\n"
+ "addvl x25, x25, #1\n"
+ "56:" // Height 4: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 45b\n"
+ "b 86f\n"
+ "57:" // Height 5
+ "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x16, %x[col_bias]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 58f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19\n"
+ "add x25, x25, x19\n"
+ "add x23, x23, x19\n"
+ "b 59f\n"
+ "58:" // Height 5: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19\n"
+ "add x27, x9, x19\n"
+ "add x25, x27, x19\n"
+ "add x23, x25, x19\n"
+ "59:" // Height 5: Column loop
+ "mov z8.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "whilelt p1.b, x19, x15\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "60:" // Height 5: setup done
+ "mov x12, #0x0\n"
+ "61:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 62f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x12, 63f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "b 63f\n"
+ "62:" // Height 5: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "63:" // Height 5: input setup done
+ "cmp x11, #0x10\n"
+ "ble 65f\n"
+ "64:" // Height 5: Multiply loop: Main loop head
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z20.s, z6.b, z3.b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x10\n"
+ "sdot z24.s, z6.b, z4.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sdot z21.s, z7.b, z3.b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sdot z25.s, z7.b, z4.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z22.s, z6.b, z3.b[0]\n"
+ "sdot z26.s, z6.b, z4.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z23.s, z7.b, z3.b[0]\n"
+ "sdot z27.s, z7.b, z4.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "sdot z20.s, z6.b, z3.b[1]\n"
+ "sdot z24.s, z6.b, z4.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "sdot z21.s, z7.b, z3.b[1]\n"
+ "sdot z25.s, z7.b, z4.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z22.s, z6.b, z3.b[1]\n"
+ "sdot z26.s, z6.b, z4.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z23.s, z7.b, z3.b[1]\n"
+ "sdot z27.s, z7.b, z4.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "sdot z20.s, z6.b, z3.b[2]\n"
+ "sdot z24.s, z6.b, z4.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "sdot z21.s, z7.b, z3.b[2]\n"
+ "sdot z25.s, z7.b, z4.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z22.s, z6.b, z3.b[2]\n"
+ "sdot z26.s, z6.b, z4.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z23.s, z7.b, z3.b[2]\n"
+ "sdot z27.s, z7.b, z4.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "sdot z20.s, z6.b, z3.b[3]\n"
+ "sdot z24.s, z6.b, z4.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "sdot z21.s, z7.b, z3.b[3]\n"
+ "sdot z25.s, z7.b, z4.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z22.s, z6.b, z3.b[3]\n"
+ "sdot z26.s, z6.b, z4.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z23.s, z7.b, z3.b[3]\n"
+ "sdot z27.s, z7.b, z4.b[3]\n"
+ "bgt 64b\n"
+ "65:" // Height 5: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "sdot z20.s, z6.b, z3.b[0]\n"
+ "sdot z24.s, z6.b, z4.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z21.s, z7.b, z3.b[0]\n"
+ "sdot z25.s, z7.b, z4.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z22.s, z6.b, z3.b[0]\n"
+ "sdot z26.s, z6.b, z4.b[0]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z23.s, z7.b, z3.b[0]\n"
+ "sdot z27.s, z7.b, z4.b[0]\n"
+ "ble 66f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "sdot z20.s, z6.b, z3.b[1]\n"
+ "sdot z24.s, z6.b, z4.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "sdot z21.s, z7.b, z3.b[1]\n"
+ "sdot z25.s, z7.b, z4.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z22.s, z6.b, z3.b[1]\n"
+ "sdot z26.s, z6.b, z4.b[1]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z23.s, z7.b, z3.b[1]\n"
+ "sdot z27.s, z7.b, z4.b[1]\n"
+ "ble 66f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "sdot z20.s, z6.b, z3.b[2]\n"
+ "sdot z24.s, z6.b, z4.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "sdot z21.s, z7.b, z3.b[2]\n"
+ "sdot z25.s, z7.b, z4.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z22.s, z6.b, z3.b[2]\n"
+ "sdot z26.s, z6.b, z4.b[2]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z23.s, z7.b, z3.b[2]\n"
+ "sdot z27.s, z7.b, z4.b[2]\n"
+ "ble 66f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "sdot z20.s, z6.b, z3.b[3]\n"
+ "sdot z24.s, z6.b, z4.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "sdot z21.s, z7.b, z3.b[3]\n"
+ "sdot z25.s, z7.b, z4.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z22.s, z6.b, z3.b[3]\n"
+ "sdot z26.s, z6.b, z4.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z23.s, z7.b, z3.b[3]\n"
+ "sdot z27.s, z7.b, z4.b[3]\n"
+ "66:" // Height 5: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 61b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "ld1w { z0.s }, p2/Z, [x16]\n"
+ "add z8.s, z8.s, z0.s\n"
+ "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n"
+ "add z12.s, z12.s, z0.s\n"
+ "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n"
+ "add z16.s, z16.s, z0.s\n"
+ "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n"
+ "addvl x16, x16, #4\n"
+ "add z9.s, z9.s, z1.s\n"
+ "add z13.s, z13.s, z1.s\n"
+ "add z10.s, z10.s, z2.s\n"
+ "add z11.s, z11.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
+ "add z15.s, z15.s, z3.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "add z20.s, z20.s, z0.s\n"
+ "add z21.s, z21.s, z1.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z23.s, z23.s, z3.s\n"
+ "add z24.s, z24.s, z0.s\n"
+ "add z25.s, z25.s, z1.s\n"
+ "add z26.s, z26.s, z2.s\n"
+ "add z27.s, z27.s, z3.s\n"
+ "tbz %x[flags], #4, 67f\n"
+ "ld1w { z0.s }, p2/Z, [x17]\n"
+ "ld1w { z4.s }, p2/Z, [x8]\n"
+ "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n"
+ "addvl x17, x17, #4\n"
+ "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n"
+ "addvl x8, x8, #4\n"
+ "b 68f\n"
+ "67:" // Height 5: per layer parameters
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x19]\n"
+ "mov z1.d, z0.d\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "mov z2.d, z0.d\n"
+ "mov z3.d, z0.d\n"
+ "mov z5.d, z4.d\n"
+ "mov z6.d, z4.d\n"
+ "mov z7.d, z4.d\n"
+ "68:" // Height 5: parameters loaded
+ ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
+ ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
+ ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
+ ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
+ ".inst 0x04a4758c // sqrdmulh z12.s, z12.s, z4.s\n"
+ ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n"
+ ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n"
+ ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n"
+ ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n"
+ ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x04a576b5 // sqrdmulh z21.s, z21.s, z5.s\n"
+ ".inst 0x04a676d6 // sqrdmulh z22.s, z22.s, z6.s\n"
+ ".inst 0x04a776f7 // sqrdmulh z23.s, z23.s, z7.s\n"
+ ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
+ ".inst 0x04a57739 // sqrdmulh z25.s, z25.s, z5.s\n"
+ ".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n"
+ ".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n"
+ "tbz %x[flags], #5, 69f\n"
+ "and z4.d, z8.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z9.d, z1.d\n"
+ "and z6.d, z10.d, z2.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z11.d, z3.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z4.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "and z4.d, z12.d, z0.d\n"
+ "sqadd z9.s, z9.s, z5.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z10.s, z10.s, z6.s\n"
+ "and z5.d, z13.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z11.s, z11.s, z7.s\n"
+ "and z6.d, z14.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z12.s, z12.s, z4.s\n"
+ "and z7.d, z15.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z13.s, z13.s, z5.s\n"
+ "and z4.d, z16.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z14.s, z14.s, z6.s\n"
+ "and z5.d, z17.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z15.s, z15.s, z7.s\n"
+ "and z6.d, z18.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "and z7.d, z19.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "and z4.d, z20.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "and z5.d, z21.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "and z6.d, z22.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z20.s, z20.s, z4.s\n"
+ "and z7.d, z23.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z21.s, z21.s, z5.s\n"
+ "and z4.d, z24.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z22.s, z22.s, z6.s\n"
+ "and z5.d, z25.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z7.s\n"
+ "and z6.d, z26.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z4.s\n"
+ "and z7.d, z27.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z25.s, z25.s, z5.s\n"
+ "sqadd z26.s, z26.s, z6.s\n"
+ "sqadd z27.s, z27.s, z7.s\n"
+ "69:" // Height 5: no shift correction
+ ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
+ "add z8.s, z8.s, z4.s\n"
+ "add z9.s, z9.s, z4.s\n"
+ "add z10.s, z10.s, z4.s\n"
+ "add z11.s, z11.s, z4.s\n"
+ "add z12.s, z12.s, z4.s\n"
+ "smin z8.s, p2/M, z8.s, z6.s\n"
+ "smin z9.s, p2/M, z9.s, z6.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smin z11.s, p2/M, z11.s, z6.s\n"
+ "smax z8.s, p2/M, z8.s, z5.s\n"
+ "smax z9.s, p2/M, z9.s, z5.s\n"
+ "smax z10.s, p2/M, z10.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z5.s\n"
+ "smin z12.s, p2/M, z12.s, z6.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n"
+ "uzp1 z9.h, z10.h, z11.h\n"
+ "smax z12.s, p2/M, z12.s, z5.s\n"
+ "uzp1 z8.b, z8.b, z9.b\n"
+ "st1b { z8.b }, p1, [x13]\n"
+ "add z13.s, z13.s, z4.s\n"
+ "addvl x13, x13, #1\n"
+ ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
+ ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "smin z13.s, p2/M, z13.s, z6.s\n"
+ ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
+ "add z14.s, z14.s, z4.s\n"
+ "add z15.s, z15.s, z4.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "smax z13.s, p2/M, z13.s, z5.s\n"
+ "smin z14.s, p2/M, z14.s, z6.s\n"
+ "smin z15.s, p2/M, z15.s, z6.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "smax z14.s, p2/M, z14.s, z5.s\n"
+ "smax z15.s, p2/M, z15.s, z5.s\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
+ "uzp1 z13.h, z14.h, z15.h\n"
+ ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
+ "uzp1 z12.b, z12.b, z13.b\n"
+ "st1b { z12.b }, p1, [x9]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "addvl x9, x9, #1\n"
+ "add z19.s, z19.s, z4.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "add z20.s, z20.s, z4.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n"
+ ".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "add z21.s, z21.s, z4.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x27]\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "addvl x27, x27, #1\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ ".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n"
+ ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
+ ".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "add z25.s, z25.s, z4.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "smin z24.s, p2/M, z24.s, z6.s\n"
+ "smin z25.s, p2/M, z25.s, z6.s\n"
+ ".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "smax z24.s, p2/M, z24.s, z5.s\n"
+ "smax z25.s, p2/M, z25.s, z5.s\n"
+ "add z26.s, z26.s, z4.s\n"
+ "uzp1 z21.h, z22.h, z23.h\n"
+ ".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "uzp1 z20.b, z20.b, z21.b\n"
+ "st1b { z20.b }, p1, [x25]\n"
+ "add z27.s, z27.s, z4.s\n"
+ "addvl x25, x25, #1\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ "smax z26.s, p2/M, z26.s, z5.s\n"
+ "smax z27.s, p2/M, z27.s, z5.s\n"
+ "uzp1 z25.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z25.b\n"
+ "st1b { z24.b }, p1, [x23]\n"
+ "addvl x23, x23, #1\n"
+ "70:" // Height 5: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 59b\n"
+ "b 86f\n"
+ "71:" // Height 6
+ "ldr x8, [%x[args_ptr], %[offsetof_multiplier_ptr]]\n"
+ "mov x16, %x[col_bias]\n"
+ "ldr x17, [%x[args_ptr], %[offsetof_shift_ptr]]\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 72f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19\n"
+ "ldr x21, [%x[output_ptr], #0x28]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "add x25, x25, x19\n"
+ "add x23, x23, x19\n"
+ "add x21, x21, x19\n"
+ "b 73f\n"
+ "72:" // Height 6: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19\n"
+ "add x27, x9, x19\n"
+ "add x25, x27, x19\n"
+ "add x23, x25, x19\n"
+ "add x21, x23, x19\n"
+ "add %x[output_ptr], x21, x19\n"
+ "73:" // Height 6: Column loop
+ "mov z8.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "whilelt p1.b, x19, x15\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "74:" // Height 6: setup done
+ "mov x12, #0x0\n"
+ "75:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 76f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x12, 77f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "add x20, x20, x19\n"
+ "b 77f\n"
+ "76:" // Height 6: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "add x20, x22, x19\n"
+ "77:" // Height 6: input setup done
+ "cmp x11, #0x10\n"
+ "ble 79f\n"
+ "78:" // Height 6: Multiply loop: Main loop head
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "ld1rqb { z5.b }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z20.s, z6.b, z3.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x20, x20, #0x10\n"
+ "sdot z24.s, z6.b, z4.b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x10\n"
+ "sdot z28.s, z6.b, z5.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sdot z21.s, z7.b, z3.b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sdot z25.s, z7.b, z4.b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sdot z29.s, z7.b, z5.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z22.s, z6.b, z3.b[0]\n"
+ "sdot z26.s, z6.b, z4.b[0]\n"
+ "sdot z30.s, z6.b, z5.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z23.s, z7.b, z3.b[0]\n"
+ "sdot z27.s, z7.b, z4.b[0]\n"
+ "sdot z31.s, z7.b, z5.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "sdot z20.s, z6.b, z3.b[1]\n"
+ "sdot z24.s, z6.b, z4.b[1]\n"
+ "sdot z28.s, z6.b, z5.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "sdot z21.s, z7.b, z3.b[1]\n"
+ "sdot z25.s, z7.b, z4.b[1]\n"
+ "sdot z29.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z22.s, z6.b, z3.b[1]\n"
+ "sdot z26.s, z6.b, z4.b[1]\n"
+ "sdot z30.s, z6.b, z5.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-8, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z23.s, z7.b, z3.b[1]\n"
+ "sdot z27.s, z7.b, z4.b[1]\n"
+ "sdot z31.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-7, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "sdot z20.s, z6.b, z3.b[2]\n"
+ "sdot z24.s, z6.b, z4.b[2]\n"
+ "sdot z28.s, z6.b, z5.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "sdot z21.s, z7.b, z3.b[2]\n"
+ "sdot z25.s, z7.b, z4.b[2]\n"
+ "sdot z29.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-5, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z22.s, z6.b, z3.b[2]\n"
+ "sdot z26.s, z6.b, z4.b[2]\n"
+ "sdot z30.s, z6.b, z5.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z23.s, z7.b, z3.b[2]\n"
+ "sdot z27.s, z7.b, z4.b[2]\n"
+ "sdot z31.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-3, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "sdot z20.s, z6.b, z3.b[3]\n"
+ "sdot z24.s, z6.b, z4.b[3]\n"
+ "sdot z28.s, z6.b, z5.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #-2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "sdot z21.s, z7.b, z3.b[3]\n"
+ "sdot z25.s, z7.b, z4.b[3]\n"
+ "sdot z29.s, z7.b, z5.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #-1, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z22.s, z6.b, z3.b[3]\n"
+ "sdot z26.s, z6.b, z4.b[3]\n"
+ "sdot z30.s, z6.b, z5.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z23.s, z7.b, z3.b[3]\n"
+ "sdot z27.s, z7.b, z4.b[3]\n"
+ "sdot z31.s, z7.b, z5.b[3]\n"
+ "bgt 78b\n"
+ "79:" // Height 6: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "ld1rqb { z5.b }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z20.s, z6.b, z3.b[0]\n"
+ "add x20, x20, #0x10\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "sdot z24.s, z6.b, z4.b[0]\n"
+ "sdot z28.s, z6.b, z5.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z21.s, z7.b, z3.b[0]\n"
+ "sdot z25.s, z7.b, z4.b[0]\n"
+ "sdot z29.s, z7.b, z5.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z22.s, z6.b, z3.b[0]\n"
+ "sdot z26.s, z6.b, z4.b[0]\n"
+ "sdot z30.s, z6.b, z5.b[0]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z23.s, z7.b, z3.b[0]\n"
+ "sdot z27.s, z7.b, z4.b[0]\n"
+ "sdot z31.s, z7.b, z5.b[0]\n"
+ "ble 80f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "sdot z20.s, z6.b, z3.b[1]\n"
+ "sdot z24.s, z6.b, z4.b[1]\n"
+ "sdot z28.s, z6.b, z5.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "sdot z21.s, z7.b, z3.b[1]\n"
+ "sdot z25.s, z7.b, z4.b[1]\n"
+ "sdot z29.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z22.s, z6.b, z3.b[1]\n"
+ "sdot z26.s, z6.b, z4.b[1]\n"
+ "sdot z30.s, z6.b, z5.b[1]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z23.s, z7.b, z3.b[1]\n"
+ "sdot z27.s, z7.b, z4.b[1]\n"
+ "sdot z31.s, z7.b, z5.b[1]\n"
+ "ble 80f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "sdot z20.s, z6.b, z3.b[2]\n"
+ "sdot z24.s, z6.b, z4.b[2]\n"
+ "sdot z28.s, z6.b, z5.b[2]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "sdot z21.s, z7.b, z3.b[2]\n"
+ "sdot z25.s, z7.b, z4.b[2]\n"
+ "sdot z29.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z22.s, z6.b, z3.b[2]\n"
+ "sdot z26.s, z6.b, z4.b[2]\n"
+ "sdot z30.s, z6.b, z5.b[2]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z23.s, z7.b, z3.b[2]\n"
+ "sdot z27.s, z7.b, z4.b[2]\n"
+ "sdot z31.s, z7.b, z5.b[2]\n"
+ "ble 80f\n"
+ "ld1b { z6.b }, p2/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #1, MUL VL]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "sdot z20.s, z6.b, z3.b[3]\n"
+ "sdot z24.s, z6.b, z4.b[3]\n"
+ "sdot z28.s, z6.b, z5.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "sdot z21.s, z7.b, z3.b[3]\n"
+ "sdot z25.s, z7.b, z4.b[3]\n"
+ "sdot z29.s, z7.b, z5.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z22.s, z6.b, z3.b[3]\n"
+ "sdot z26.s, z6.b, z4.b[3]\n"
+ "sdot z30.s, z6.b, z5.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z23.s, z7.b, z3.b[3]\n"
+ "sdot z27.s, z7.b, z4.b[3]\n"
+ "sdot z31.s, z7.b, z5.b[3]\n"
+ "80:" // Height 6: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 75b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "ld1w { z0.s }, p2/Z, [x16]\n"
+ "add z8.s, z8.s, z0.s\n"
+ "ld1w { z1.s }, p2/Z, [x16, #1, MUL VL]\n"
+ "add z12.s, z12.s, z0.s\n"
+ "ld1w { z2.s }, p2/Z, [x16, #2, MUL VL]\n"
+ "add z16.s, z16.s, z0.s\n"
+ "ld1w { z3.s }, p2/Z, [x16, #3, MUL VL]\n"
+ "addvl x16, x16, #4\n"
+ "add z9.s, z9.s, z1.s\n"
+ "add z13.s, z13.s, z1.s\n"
+ "add z10.s, z10.s, z2.s\n"
+ "add z11.s, z11.s, z3.s\n"
+ "add z14.s, z14.s, z2.s\n"
+ "add z15.s, z15.s, z3.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "add z20.s, z20.s, z0.s\n"
+ "add z21.s, z21.s, z1.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z23.s, z23.s, z3.s\n"
+ "add z24.s, z24.s, z0.s\n"
+ "add z25.s, z25.s, z1.s\n"
+ "add z26.s, z26.s, z2.s\n"
+ "add z27.s, z27.s, z3.s\n"
+ "add z28.s, z28.s, z0.s\n"
+ "add z29.s, z29.s, z1.s\n"
+ "add z30.s, z30.s, z2.s\n"
+ "add z31.s, z31.s, z3.s\n"
+ "tbz %x[flags], #4, 81f\n"
+ "ld1w { z0.s }, p2/Z, [x17]\n"
+ "ld1w { z4.s }, p2/Z, [x8]\n"
+ "ld1w { z1.s }, p2/Z, [x17, #1, MUL VL]\n"
+ "ld1w { z5.s }, p2/Z, [x8, #1, MUL VL]\n"
+ "ld1w { z2.s }, p2/Z, [x17, #2, MUL VL]\n"
+ "ld1w { z6.s }, p2/Z, [x8, #2, MUL VL]\n"
+ "ld1w { z3.s }, p2/Z, [x17, #3, MUL VL]\n"
+ "addvl x17, x17, #4\n"
+ "ld1w { z7.s }, p2/Z, [x8, #3, MUL VL]\n"
+ "addvl x8, x8, #4\n"
+ "b 82f\n"
+ "81:" // Height 6: per layer parameters
+ "add x19, %x[qp], %[per_layer_right_shift]\n"
+ "ld1rw { z0.s }, p2/Z, [x19]\n"
+ "mov z1.d, z0.d\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "mov z2.d, z0.d\n"
+ "mov z3.d, z0.d\n"
+ "mov z5.d, z4.d\n"
+ "mov z6.d, z4.d\n"
+ "mov z7.d, z4.d\n"
+ "82:" // Height 6: parameters loaded
+ ".inst 0x04a47508 // sqrdmulh z8.s, z8.s, z4.s\n"
+ ".inst 0x04a57529 // sqrdmulh z9.s, z9.s, z5.s\n"
+ ".inst 0x04a6754a // sqrdmulh z10.s, z10.s, z6.s\n"
+ ".inst 0x04a7756b // sqrdmulh z11.s, z11.s, z7.s\n"
+ ".inst 0x04a4758c // sqrdmulh z12.s, z12.s, z4.s\n"
+ ".inst 0x04a575ad // sqrdmulh z13.s, z13.s, z5.s\n"
+ ".inst 0x04a675ce // sqrdmulh z14.s, z14.s, z6.s\n"
+ ".inst 0x04a775ef // sqrdmulh z15.s, z15.s, z7.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a57631 // sqrdmulh z17.s, z17.s, z5.s\n"
+ ".inst 0x04a67652 // sqrdmulh z18.s, z18.s, z6.s\n"
+ ".inst 0x04a77673 // sqrdmulh z19.s, z19.s, z7.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x04a576b5 // sqrdmulh z21.s, z21.s, z5.s\n"
+ ".inst 0x04a676d6 // sqrdmulh z22.s, z22.s, z6.s\n"
+ ".inst 0x04a776f7 // sqrdmulh z23.s, z23.s, z7.s\n"
+ ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
+ ".inst 0x04a57739 // sqrdmulh z25.s, z25.s, z5.s\n"
+ ".inst 0x04a6775a // sqrdmulh z26.s, z26.s, z6.s\n"
+ ".inst 0x04a7777b // sqrdmulh z27.s, z27.s, z7.s\n"
+ ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n"
+ ".inst 0x04a577bd // sqrdmulh z29.s, z29.s, z5.s\n"
+ ".inst 0x04a677de // sqrdmulh z30.s, z30.s, z6.s\n"
+ ".inst 0x04a777ff // sqrdmulh z31.s, z31.s, z7.s\n"
+ "tbz %x[flags], #5, 83f\n"
+ "and z4.d, z8.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z9.d, z1.d\n"
+ "and z6.d, z10.d, z2.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z11.d, z3.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z8.s, z8.s, z4.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "and z4.d, z12.d, z0.d\n"
+ "sqadd z9.s, z9.s, z5.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z10.s, z10.s, z6.s\n"
+ "and z5.d, z13.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z11.s, z11.s, z7.s\n"
+ "and z6.d, z14.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z12.s, z12.s, z4.s\n"
+ "and z7.d, z15.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z13.s, z13.s, z5.s\n"
+ "and z4.d, z16.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z14.s, z14.s, z6.s\n"
+ "and z5.d, z17.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z15.s, z15.s, z7.s\n"
+ "and z6.d, z18.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "and z7.d, z19.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "and z4.d, z20.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "and z5.d, z21.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "and z6.d, z22.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z20.s, z20.s, z4.s\n"
+ "and z7.d, z23.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z21.s, z21.s, z5.s\n"
+ "and z4.d, z24.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z22.s, z22.s, z6.s\n"
+ "and z5.d, z25.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z23.s, z23.s, z7.s\n"
+ "and z6.d, z26.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z4.s\n"
+ "and z7.d, z27.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z25.s, z25.s, z5.s\n"
+ "and z4.d, z28.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z26.s, z26.s, z6.s\n"
+ "and z5.d, z29.d, z1.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z27.s, z27.s, z7.s\n"
+ "and z6.d, z30.d, z2.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z4.s\n"
+ "and z7.d, z31.d, z3.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z29.s, z29.s, z5.s\n"
+ "sqadd z30.s, z30.s, z6.s\n"
+ "sqadd z31.s, z31.s, z7.s\n"
+ "83:" // Height 6: no shift correction
+ ".inst 0x44828808 // srshl z8.s, p2/M, z8.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828829 // srshl z9.s, p2/M, z9.s, z1.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x4482884a // srshl z10.s, p2/M, z10.s, z2.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x4482886b // srshl z11.s, p2/M, z11.s, z3.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ ".inst 0x4482880c // srshl z12.s, p2/M, z12.s, z0.s\n"
+ "add z8.s, z8.s, z4.s\n"
+ "add z9.s, z9.s, z4.s\n"
+ "add z10.s, z10.s, z4.s\n"
+ "add z11.s, z11.s, z4.s\n"
+ "add z12.s, z12.s, z4.s\n"
+ "smin z8.s, p2/M, z8.s, z6.s\n"
+ "smin z9.s, p2/M, z9.s, z6.s\n"
+ "smin z10.s, p2/M, z10.s, z6.s\n"
+ "smin z11.s, p2/M, z11.s, z6.s\n"
+ "smax z8.s, p2/M, z8.s, z5.s\n"
+ "smax z9.s, p2/M, z9.s, z5.s\n"
+ "smax z10.s, p2/M, z10.s, z5.s\n"
+ "smax z11.s, p2/M, z11.s, z5.s\n"
+ "smin z12.s, p2/M, z12.s, z6.s\n"
+ "uzp1 z8.h, z8.h, z9.h\n"
+ ".inst 0x4482882d // srshl z13.s, p2/M, z13.s, z1.s\n"
+ "uzp1 z9.h, z10.h, z11.h\n"
+ "smax z12.s, p2/M, z12.s, z5.s\n"
+ "uzp1 z8.b, z8.b, z9.b\n"
+ "st1b { z8.b }, p1, [x13]\n"
+ "add z13.s, z13.s, z4.s\n"
+ "addvl x13, x13, #1\n"
+ ".inst 0x4482884e // srshl z14.s, p2/M, z14.s, z2.s\n"
+ ".inst 0x4482886f // srshl z15.s, p2/M, z15.s, z3.s\n"
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "smin z13.s, p2/M, z13.s, z6.s\n"
+ ".inst 0x44828831 // srshl z17.s, p2/M, z17.s, z1.s\n"
+ "add z14.s, z14.s, z4.s\n"
+ "add z15.s, z15.s, z4.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "smax z13.s, p2/M, z13.s, z5.s\n"
+ "smin z14.s, p2/M, z14.s, z6.s\n"
+ "smin z15.s, p2/M, z15.s, z6.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "uzp1 z12.h, z12.h, z13.h\n"
+ "smax z14.s, p2/M, z14.s, z5.s\n"
+ "smax z15.s, p2/M, z15.s, z5.s\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ ".inst 0x44828852 // srshl z18.s, p2/M, z18.s, z2.s\n"
+ "uzp1 z13.h, z14.h, z15.h\n"
+ ".inst 0x44828873 // srshl z19.s, p2/M, z19.s, z3.s\n"
+ "uzp1 z12.b, z12.b, z13.b\n"
+ "st1b { z12.b }, p1, [x9]\n"
+ "add z18.s, z18.s, z4.s\n"
+ "addvl x9, x9, #1\n"
+ "add z19.s, z19.s, z4.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "add z20.s, z20.s, z4.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ ".inst 0x44828835 // srshl z21.s, p2/M, z21.s, z1.s\n"
+ ".inst 0x44828856 // srshl z22.s, p2/M, z22.s, z2.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "add z21.s, z21.s, z4.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x27]\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "addvl x27, x27, #1\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ ".inst 0x44828877 // srshl z23.s, p2/M, z23.s, z3.s\n"
+ ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
+ ".inst 0x44828839 // srshl z25.s, p2/M, z25.s, z1.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "add z25.s, z25.s, z4.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "smin z24.s, p2/M, z24.s, z6.s\n"
+ "smin z25.s, p2/M, z25.s, z6.s\n"
+ ".inst 0x4482885a // srshl z26.s, p2/M, z26.s, z2.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "smax z24.s, p2/M, z24.s, z5.s\n"
+ "smax z25.s, p2/M, z25.s, z5.s\n"
+ "add z26.s, z26.s, z4.s\n"
+ "uzp1 z21.h, z22.h, z23.h\n"
+ ".inst 0x4482887b // srshl z27.s, p2/M, z27.s, z3.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "uzp1 z20.b, z20.b, z21.b\n"
+ "st1b { z20.b }, p1, [x25]\n"
+ "add z27.s, z27.s, z4.s\n"
+ "addvl x25, x25, #1\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
+ ".inst 0x4482883d // srshl z29.s, p2/M, z29.s, z1.s\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ ".inst 0x4482885e // srshl z30.s, p2/M, z30.s, z2.s\n"
+ "add z28.s, z28.s, z4.s\n"
+ "add z29.s, z29.s, z4.s\n"
+ "smax z26.s, p2/M, z26.s, z5.s\n"
+ "add z30.s, z30.s, z4.s\n"
+ "smax z27.s, p2/M, z27.s, z5.s\n"
+ "smin z28.s, p2/M, z28.s, z6.s\n"
+ "smin z29.s, p2/M, z29.s, z6.s\n"
+ "smin z30.s, p2/M, z30.s, z6.s\n"
+ "uzp1 z25.h, z26.h, z27.h\n"
+ "smax z28.s, p2/M, z28.s, z5.s\n"
+ "uzp1 z24.b, z24.b, z25.b\n"
+ "st1b { z24.b }, p1, [x23]\n"
+ "smax z29.s, p2/M, z29.s, z5.s\n"
+ "addvl x23, x23, #1\n"
+ "smax z30.s, p2/M, z30.s, z5.s\n"
+ ".inst 0x4482887f // srshl z31.s, p2/M, z31.s, z3.s\n"
+ "uzp1 z28.h, z28.h, z29.h\n"
+ "add z31.s, z31.s, z4.s\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smax z31.s, p2/M, z31.s, z5.s\n"
+ "uzp1 z29.h, z30.h, z31.h\n"
+ "uzp1 z28.b, z28.b, z29.b\n"
+ "st1b { z28.b }, p1, [x21]\n"
+ "addvl x21, x21, #1\n"
+ "84:" // Height 6: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 73b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 86f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 85f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "85:" // Update direct input
+ "mov x19, #0x6\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "86:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [flags] "r" (flags), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_multiplier_ptr] "I" (offsetof(KernelArgs, multiplier_ptr)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_shift_ptr] "I" (offsetof(KernelArgs, shift_ptr)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "p0", "p1", "p2", "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x16", "x17", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp
deleted file mode 100644
index b30b8845a6..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_4VLx4/generic.cpp
+++ /dev/null
@@ -1,2137 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include <cstdint>
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_s8s32_dot_4VLx4(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation , bool accumulate) {
- const int K_stride = ((K + 3) / 4) * 4;
- const long loops_count = ((K + 16) / 32) - 1;
- K -= loops_count * 32;
- const long regs_count = (K / 16) - 1;
- K -= (regs_count + 1) * 16;
- const long leftovers = K;
- const long blocks_count = (K + 3) / 4;
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const int8_t * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(int8_t);
-
- int32_t *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=(4 * get_vector_length<int32_t>())) {
- const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<int32_t>()));
- long loops = loops_count;
- long regs = regs_count;
- long temp = 0;
- long blocks = blocks_count;
- const int8_t *a_ptr0 = a_ptr0_base;
- const int8_t *b_ptr0 = B + (K_stride * x0);
- const unsigned long ldcb = ldc * sizeof(int32_t);
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "whilelt p6.b, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.b\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z16.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z17.s, #0\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "mov z18.s, #0\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z19.s, #0\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "b 5f\n"
- "4:\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "5:\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "whilelt p6.b, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.b\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z16.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z17.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z18.s, #0\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "mov z19.s, #0\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z20.s, #0\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z21.s, #0\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z22.s, #0\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z23.s, #0\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "sdot z23.s, z11.b, z1.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "sdot z21.s, z13.b, z1.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "sdot z21.s, z9.b, z1.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z21.s, z13.b, z1.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
- "sdot z23.s, z15.b, z1.b[3]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
- "sdot z20.s, z8.b, z5.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
- "sdot z21.s, z9.b, z5.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "sdot z23.s, z11.b, z5.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "sdot z20.s, z12.b, z5.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
- "sdot z21.s, z13.b, z5.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "sdot z22.s, z14.b, z5.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "sdot z23.s, z15.b, z5.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "sdot z20.s, z8.b, z5.b[2]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
- "sdot z21.s, z9.b, z5.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "sdot z22.s, z10.b, z5.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "sdot z23.s, z11.b, z5.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "sdot z20.s, z12.b, z5.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
- "sdot z21.s, z13.b, z5.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "sdot z22.s, z14.b, z5.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "sdot z23.s, z15.b, z5.b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "sdot z23.s, z11.b, z1.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "sdot z21.s, z13.b, z1.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "sdot z21.s, z9.b, z1.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z21.s, z13.b, z1.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "sdot z23.s, z15.b, z1.b[3]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "sdot z20.s, z8.b, z5.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "sdot z21.s, z9.b, z5.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "sdot z23.s, z11.b, z5.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "sdot z20.s, z12.b, z5.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
- "sdot z21.s, z13.b, z5.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "sdot z22.s, z14.b, z5.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "sdot z23.s, z15.b, z5.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "sdot z20.s, z8.b, z5.b[2]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
- "sdot z21.s, z9.b, z5.b[2]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "sdot z22.s, z10.b, z5.b[2]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "sdot z23.s, z11.b, z5.b[2]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "sdot z20.s, z12.b, z5.b[3]\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
- "sdot z21.s, z13.b, z5.b[3]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "sdot z22.s, z14.b, z5.b[3]\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "sdot z23.s, z15.b, z5.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[0]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "sdot z23.s, z11.b, z1.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "sdot z21.s, z13.b, z1.b[1]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "sdot z21.s, z9.b, z1.b[2]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z21.s, z13.b, z1.b[3]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "sdot z23.s, z15.b, z1.b[3]\n"
- "b 5f\n"
- "4:\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr1]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "sdot z23.s, z11.b, z1.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "sdot z21.s, z13.b, z1.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "sdot z21.s, z9.b, z1.b[2]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z21.s, z13.b, z1.b[3]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "sdot z23.s, z15.b, z1.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z20.s, z8.b, z5.b[0]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
- "sdot z21.s, z9.b, z5.b[0]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "sdot z23.s, z11.b, z5.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z20.s, z12.b, z5.b[1]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
- "sdot z21.s, z13.b, z5.b[1]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "sdot z22.s, z14.b, z5.b[1]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "sdot z23.s, z15.b, z5.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "sdot z20.s, z8.b, z5.b[2]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
- "sdot z21.s, z9.b, z5.b[2]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "sdot z22.s, z10.b, z5.b[2]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "sdot z23.s, z11.b, z5.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "sdot z20.s, z12.b, z5.b[3]\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
- "sdot z21.s, z13.b, z5.b[3]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "sdot z22.s, z14.b, z5.b[3]\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "sdot z23.s, z15.b, z5.b[3]\n"
- "5:\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "st1w z20.s, p0, [c_ptr1]\n"
- "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
- "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
- "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "whilelt p6.b, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.b\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z16.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z17.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z18.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z19.s, #0\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "mov z20.s, #0\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z21.s, #0\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z22.s, #0\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z23.s, #0\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z24.s, #0\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z25.s, #0\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "mov z26.s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "mov z27.s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1w z24.s, p0/z, [c_ptr2]\n"
- "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
- "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "sdot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "sdot z26.s, z10.b, z2.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "sdot z23.s, z11.b, z1.b[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "sdot z27.s, z11.b, z2.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
- "sdot z24.s, z12.b, z2.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "sdot z21.s, z13.b, z1.b[1]\n"
- "sdot z25.s, z13.b, z2.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "sdot z26.s, z14.b, z2.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
- "sdot z27.s, z15.b, z2.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z24.s, z8.b, z2.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "sdot z21.s, z9.b, z1.b[2]\n"
- "sdot z25.s, z9.b, z2.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "sdot z26.s, z10.b, z2.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
- "sdot z27.s, z11.b, z2.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "sdot z24.s, z12.b, z2.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z21.s, z13.b, z1.b[3]\n"
- "sdot z25.s, z13.b, z2.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
- "sdot z26.s, z14.b, z2.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
- "sdot z23.s, z15.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
- "sdot z27.s, z15.b, z2.b[3]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
- "sdot z20.s, z8.b, z5.b[0]\n"
- "sdot z24.s, z8.b, z6.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
- "sdot z21.s, z9.b, z5.b[0]\n"
- "sdot z25.s, z9.b, z6.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
- "sdot z26.s, z10.b, z6.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "sdot z23.s, z11.b, z5.b[0]\n"
- "sdot z27.s, z11.b, z6.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "sdot z20.s, z12.b, z5.b[1]\n"
- "sdot z24.s, z12.b, z6.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
- "sdot z21.s, z13.b, z5.b[1]\n"
- "sdot z25.s, z13.b, z6.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "sdot z22.s, z14.b, z5.b[1]\n"
- "sdot z26.s, z14.b, z6.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "sdot z23.s, z15.b, z5.b[1]\n"
- "sdot z27.s, z15.b, z6.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "sdot z20.s, z8.b, z5.b[2]\n"
- "sdot z24.s, z8.b, z6.b[2]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "sdot z21.s, z9.b, z5.b[2]\n"
- "sdot z25.s, z9.b, z6.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "sdot z22.s, z10.b, z5.b[2]\n"
- "sdot z26.s, z10.b, z6.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "sdot z23.s, z11.b, z5.b[2]\n"
- "sdot z27.s, z11.b, z6.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "sdot z20.s, z12.b, z5.b[3]\n"
- "sdot z24.s, z12.b, z6.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
- "sdot z21.s, z13.b, z5.b[3]\n"
- "sdot z25.s, z13.b, z6.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "sdot z22.s, z14.b, z5.b[3]\n"
- "sdot z26.s, z14.b, z6.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "sdot z23.s, z15.b, z5.b[3]\n"
- "sdot z27.s, z15.b, z6.b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "sdot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "sdot z26.s, z10.b, z2.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "sdot z23.s, z11.b, z1.b[0]\n"
- "sdot z27.s, z11.b, z2.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
- "sdot z24.s, z12.b, z2.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "sdot z21.s, z13.b, z1.b[1]\n"
- "sdot z25.s, z13.b, z2.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "sdot z26.s, z14.b, z2.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
- "sdot z27.s, z15.b, z2.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z24.s, z8.b, z2.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "sdot z21.s, z9.b, z1.b[2]\n"
- "sdot z25.s, z9.b, z2.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "sdot z26.s, z10.b, z2.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
- "sdot z27.s, z11.b, z2.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "sdot z24.s, z12.b, z2.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z21.s, z13.b, z1.b[3]\n"
- "sdot z25.s, z13.b, z2.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
- "sdot z26.s, z14.b, z2.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "sdot z23.s, z15.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "sdot z27.s, z15.b, z2.b[3]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "sdot z20.s, z8.b, z5.b[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "sdot z24.s, z8.b, z6.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "sdot z21.s, z9.b, z5.b[0]\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- "sdot z25.s, z9.b, z6.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
- "sdot z26.s, z10.b, z6.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "sdot z23.s, z11.b, z5.b[0]\n"
- "sdot z27.s, z11.b, z6.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "sdot z20.s, z12.b, z5.b[1]\n"
- "sdot z24.s, z12.b, z6.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
- "sdot z21.s, z13.b, z5.b[1]\n"
- "sdot z25.s, z13.b, z6.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "sdot z22.s, z14.b, z5.b[1]\n"
- "sdot z26.s, z14.b, z6.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "sdot z23.s, z15.b, z5.b[1]\n"
- "sdot z27.s, z15.b, z6.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "sdot z20.s, z8.b, z5.b[2]\n"
- "sdot z24.s, z8.b, z6.b[2]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
- "sdot z21.s, z9.b, z5.b[2]\n"
- "sdot z25.s, z9.b, z6.b[2]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "sdot z22.s, z10.b, z5.b[2]\n"
- "sdot z26.s, z10.b, z6.b[2]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "sdot z23.s, z11.b, z5.b[2]\n"
- "sdot z27.s, z11.b, z6.b[2]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "sdot z20.s, z12.b, z5.b[3]\n"
- "sdot z24.s, z12.b, z6.b[3]\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
- "sdot z21.s, z13.b, z5.b[3]\n"
- "sdot z25.s, z13.b, z6.b[3]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "sdot z22.s, z14.b, z5.b[3]\n"
- "sdot z26.s, z14.b, z6.b[3]\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "sdot z23.s, z15.b, z5.b[3]\n"
- "sdot z27.s, z15.b, z6.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[0]\n"
- "sdot z24.s, z8.b, z2.b[0]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "sdot z26.s, z10.b, z2.b[0]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "sdot z23.s, z11.b, z1.b[0]\n"
- "sdot z27.s, z11.b, z2.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
- "sdot z24.s, z12.b, z2.b[1]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "sdot z21.s, z13.b, z1.b[1]\n"
- "sdot z25.s, z13.b, z2.b[1]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "sdot z26.s, z14.b, z2.b[1]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
- "sdot z27.s, z15.b, z2.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z24.s, z8.b, z2.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "sdot z21.s, z9.b, z1.b[2]\n"
- "sdot z25.s, z9.b, z2.b[2]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "sdot z26.s, z10.b, z2.b[2]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
- "sdot z27.s, z11.b, z2.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "sdot z24.s, z12.b, z2.b[3]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z21.s, z13.b, z1.b[3]\n"
- "sdot z25.s, z13.b, z2.b[3]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
- "sdot z26.s, z14.b, z2.b[3]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "sdot z23.s, z15.b, z1.b[3]\n"
- "sdot z27.s, z15.b, z2.b[3]\n"
- "b 5f\n"
- "4:\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
- "sdot z24.s, z8.b, z2.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr1]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr2]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "sdot z26.s, z10.b, z2.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- "sdot z23.s, z11.b, z1.b[0]\n"
- "sdot z27.s, z11.b, z2.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
- "sdot z24.s, z12.b, z2.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "sdot z21.s, z13.b, z1.b[1]\n"
- "sdot z25.s, z13.b, z2.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "sdot z26.s, z14.b, z2.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
- "sdot z27.s, z15.b, z2.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z24.s, z8.b, z2.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "sdot z21.s, z9.b, z1.b[2]\n"
- "sdot z25.s, z9.b, z2.b[2]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "sdot z26.s, z10.b, z2.b[2]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
- "sdot z27.s, z11.b, z2.b[2]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "sdot z24.s, z12.b, z2.b[3]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z21.s, z13.b, z1.b[3]\n"
- "sdot z25.s, z13.b, z2.b[3]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
- "sdot z26.s, z14.b, z2.b[3]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "sdot z23.s, z15.b, z1.b[3]\n"
- "sdot z27.s, z15.b, z2.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z20.s, z8.b, z5.b[0]\n"
- "sdot z24.s, z8.b, z6.b[0]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
- "sdot z21.s, z9.b, z5.b[0]\n"
- "sdot z25.s, z9.b, z6.b[0]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
- "sdot z26.s, z10.b, z6.b[0]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "sdot z23.s, z11.b, z5.b[0]\n"
- "sdot z27.s, z11.b, z6.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z20.s, z12.b, z5.b[1]\n"
- "sdot z24.s, z12.b, z6.b[1]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
- "sdot z21.s, z13.b, z5.b[1]\n"
- "sdot z25.s, z13.b, z6.b[1]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "sdot z22.s, z14.b, z5.b[1]\n"
- "sdot z26.s, z14.b, z6.b[1]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "sdot z23.s, z15.b, z5.b[1]\n"
- "sdot z27.s, z15.b, z6.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "sdot z20.s, z8.b, z5.b[2]\n"
- "sdot z24.s, z8.b, z6.b[2]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
- "sdot z21.s, z9.b, z5.b[2]\n"
- "sdot z25.s, z9.b, z6.b[2]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "sdot z22.s, z10.b, z5.b[2]\n"
- "sdot z26.s, z10.b, z6.b[2]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "sdot z23.s, z11.b, z5.b[2]\n"
- "sdot z27.s, z11.b, z6.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "sdot z20.s, z12.b, z5.b[3]\n"
- "sdot z24.s, z12.b, z6.b[3]\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
- "sdot z21.s, z13.b, z5.b[3]\n"
- "sdot z25.s, z13.b, z6.b[3]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "sdot z22.s, z14.b, z5.b[3]\n"
- "sdot z26.s, z14.b, z6.b[3]\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "sdot z23.s, z15.b, z5.b[3]\n"
- "sdot z27.s, z15.b, z6.b[3]\n"
- "5:\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "st1w z20.s, p0, [c_ptr1]\n"
- "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
- "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
- "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
- "st1w z24.s, p0, [c_ptr2]\n"
- "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
- "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
- "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "whilelt p6.b, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.b\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z16.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z17.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z18.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z19.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z20.s, #0\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "mov z21.s, #0\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z22.s, #0\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z23.s, #0\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z24.s, #0\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z25.s, #0\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z26.s, #0\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "mov z27.s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "mov z28.s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "mov z29.s, #0\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "mov z30.s, #0\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z31.s, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1w z24.s, p0/z, [c_ptr2]\n"
- "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
- "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
- "ld1w z28.s, p0/z, [c_ptr3]\n"
- "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
- "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "sdot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "sdot z28.s, z8.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr3]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "sdot z29.s, z9.b, z3.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "sdot z26.s, z10.b, z2.b[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "sdot z30.s, z10.b, z3.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- "sdot z23.s, z11.b, z1.b[0]\n"
- "sdot z27.s, z11.b, z2.b[0]\n"
- "sdot z31.s, z11.b, z3.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
- "sdot z24.s, z12.b, z2.b[1]\n"
- "sdot z28.s, z12.b, z3.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "sdot z21.s, z13.b, z1.b[1]\n"
- "sdot z25.s, z13.b, z2.b[1]\n"
- "sdot z29.s, z13.b, z3.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "sdot z26.s, z14.b, z2.b[1]\n"
- "sdot z30.s, z14.b, z3.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
- "sdot z27.s, z15.b, z2.b[1]\n"
- "sdot z31.s, z15.b, z3.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z24.s, z8.b, z2.b[2]\n"
- "sdot z28.s, z8.b, z3.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "sdot z21.s, z9.b, z1.b[2]\n"
- "sdot z25.s, z9.b, z2.b[2]\n"
- "sdot z29.s, z9.b, z3.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "sdot z26.s, z10.b, z2.b[2]\n"
- "sdot z30.s, z10.b, z3.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
- "sdot z27.s, z11.b, z2.b[2]\n"
- "sdot z31.s, z11.b, z3.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "sdot z24.s, z12.b, z2.b[3]\n"
- "sdot z28.s, z12.b, z3.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z21.s, z13.b, z1.b[3]\n"
- "sdot z25.s, z13.b, z2.b[3]\n"
- "sdot z29.s, z13.b, z3.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
- "sdot z26.s, z14.b, z2.b[3]\n"
- "sdot z30.s, z14.b, z3.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
- "sdot z23.s, z15.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
- "sdot z27.s, z15.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
- "sdot z31.s, z15.b, z3.b[3]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n"
- "sdot z20.s, z8.b, z5.b[0]\n"
- "sdot z24.s, z8.b, z6.b[0]\n"
- "sdot z28.s, z8.b, z7.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
- "sdot z21.s, z9.b, z5.b[0]\n"
- "sdot z25.s, z9.b, z6.b[0]\n"
- "sdot z29.s, z9.b, z7.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
- "sdot z26.s, z10.b, z6.b[0]\n"
- "sdot z30.s, z10.b, z7.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "sdot z23.s, z11.b, z5.b[0]\n"
- "sdot z27.s, z11.b, z6.b[0]\n"
- "sdot z31.s, z11.b, z7.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "sdot z20.s, z12.b, z5.b[1]\n"
- "sdot z24.s, z12.b, z6.b[1]\n"
- "sdot z28.s, z12.b, z7.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
- "sdot z21.s, z13.b, z5.b[1]\n"
- "sdot z25.s, z13.b, z6.b[1]\n"
- "sdot z29.s, z13.b, z7.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "sdot z22.s, z14.b, z5.b[1]\n"
- "sdot z26.s, z14.b, z6.b[1]\n"
- "sdot z30.s, z14.b, z7.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "sdot z23.s, z15.b, z5.b[1]\n"
- "sdot z27.s, z15.b, z6.b[1]\n"
- "sdot z31.s, z15.b, z7.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "sdot z20.s, z8.b, z5.b[2]\n"
- "sdot z24.s, z8.b, z6.b[2]\n"
- "sdot z28.s, z8.b, z7.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
- "sdot z21.s, z9.b, z5.b[2]\n"
- "sdot z25.s, z9.b, z6.b[2]\n"
- "sdot z29.s, z9.b, z7.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "sdot z22.s, z10.b, z5.b[2]\n"
- "sdot z26.s, z10.b, z6.b[2]\n"
- "sdot z30.s, z10.b, z7.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "sdot z23.s, z11.b, z5.b[2]\n"
- "sdot z27.s, z11.b, z6.b[2]\n"
- "sdot z31.s, z11.b, z7.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "sdot z20.s, z12.b, z5.b[3]\n"
- "sdot z24.s, z12.b, z6.b[3]\n"
- "sdot z28.s, z12.b, z7.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
- "sdot z21.s, z13.b, z5.b[3]\n"
- "sdot z25.s, z13.b, z6.b[3]\n"
- "sdot z29.s, z13.b, z7.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "sdot z22.s, z14.b, z5.b[3]\n"
- "sdot z26.s, z14.b, z6.b[3]\n"
- "sdot z30.s, z14.b, z7.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "sdot z23.s, z15.b, z5.b[3]\n"
- "sdot z27.s, z15.b, z6.b[3]\n"
- "sdot z31.s, z15.b, z7.b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "sdot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "sdot z28.s, z8.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr3]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "sdot z29.s, z9.b, z3.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "sdot z26.s, z10.b, z2.b[0]\n"
- "sdot z30.s, z10.b, z3.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "sdot z23.s, z11.b, z1.b[0]\n"
- "sdot z27.s, z11.b, z2.b[0]\n"
- "sdot z31.s, z11.b, z3.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
- "sdot z24.s, z12.b, z2.b[1]\n"
- "sdot z28.s, z12.b, z3.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "sdot z21.s, z13.b, z1.b[1]\n"
- "sdot z25.s, z13.b, z2.b[1]\n"
- "sdot z29.s, z13.b, z3.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "sdot z26.s, z14.b, z2.b[1]\n"
- "sdot z30.s, z14.b, z3.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
- "sdot z27.s, z15.b, z2.b[1]\n"
- "sdot z31.s, z15.b, z3.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z24.s, z8.b, z2.b[2]\n"
- "sdot z28.s, z8.b, z3.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "sdot z21.s, z9.b, z1.b[2]\n"
- "sdot z25.s, z9.b, z2.b[2]\n"
- "sdot z29.s, z9.b, z3.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "sdot z26.s, z10.b, z2.b[2]\n"
- "sdot z30.s, z10.b, z3.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
- "sdot z27.s, z11.b, z2.b[2]\n"
- "sdot z31.s, z11.b, z3.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "sdot z24.s, z12.b, z2.b[3]\n"
- "sdot z28.s, z12.b, z3.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z21.s, z13.b, z1.b[3]\n"
- "sdot z25.s, z13.b, z2.b[3]\n"
- "sdot z29.s, z13.b, z3.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
- "sdot z26.s, z14.b, z2.b[3]\n"
- "sdot z30.s, z14.b, z3.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "sdot z23.s, z15.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "sdot z27.s, z15.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "sdot z31.s, z15.b, z3.b[3]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "sdot z20.s, z8.b, z5.b[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "sdot z24.s, z8.b, z6.b[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "sdot z28.s, z8.b, z7.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- "sdot z21.s, z9.b, z5.b[0]\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- "sdot z25.s, z9.b, z6.b[0]\n"
- "sdot z29.s, z9.b, z7.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
- "sdot z26.s, z10.b, z6.b[0]\n"
- "sdot z30.s, z10.b, z7.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "sdot z23.s, z11.b, z5.b[0]\n"
- "sdot z27.s, z11.b, z6.b[0]\n"
- "sdot z31.s, z11.b, z7.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "sdot z20.s, z12.b, z5.b[1]\n"
- "sdot z24.s, z12.b, z6.b[1]\n"
- "sdot z28.s, z12.b, z7.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
- "sdot z21.s, z13.b, z5.b[1]\n"
- "sdot z25.s, z13.b, z6.b[1]\n"
- "sdot z29.s, z13.b, z7.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "sdot z22.s, z14.b, z5.b[1]\n"
- "sdot z26.s, z14.b, z6.b[1]\n"
- "sdot z30.s, z14.b, z7.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "sdot z23.s, z15.b, z5.b[1]\n"
- "sdot z27.s, z15.b, z6.b[1]\n"
- "sdot z31.s, z15.b, z7.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "sdot z20.s, z8.b, z5.b[2]\n"
- "sdot z24.s, z8.b, z6.b[2]\n"
- "sdot z28.s, z8.b, z7.b[2]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
- "sdot z21.s, z9.b, z5.b[2]\n"
- "sdot z25.s, z9.b, z6.b[2]\n"
- "sdot z29.s, z9.b, z7.b[2]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "sdot z22.s, z10.b, z5.b[2]\n"
- "sdot z26.s, z10.b, z6.b[2]\n"
- "sdot z30.s, z10.b, z7.b[2]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "sdot z23.s, z11.b, z5.b[2]\n"
- "sdot z27.s, z11.b, z6.b[2]\n"
- "sdot z31.s, z11.b, z7.b[2]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "sdot z20.s, z12.b, z5.b[3]\n"
- "sdot z24.s, z12.b, z6.b[3]\n"
- "sdot z28.s, z12.b, z7.b[3]\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
- "sdot z21.s, z13.b, z5.b[3]\n"
- "sdot z25.s, z13.b, z6.b[3]\n"
- "sdot z29.s, z13.b, z7.b[3]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "sdot z22.s, z14.b, z5.b[3]\n"
- "sdot z26.s, z14.b, z6.b[3]\n"
- "sdot z30.s, z14.b, z7.b[3]\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "sdot z23.s, z15.b, z5.b[3]\n"
- "sdot z27.s, z15.b, z6.b[3]\n"
- "sdot z31.s, z15.b, z7.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[0]\n"
- "sdot z24.s, z8.b, z2.b[0]\n"
- "sdot z28.s, z8.b, z3.b[0]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "sdot z29.s, z9.b, z3.b[0]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "sdot z26.s, z10.b, z2.b[0]\n"
- "sdot z30.s, z10.b, z3.b[0]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "sdot z23.s, z11.b, z1.b[0]\n"
- "sdot z27.s, z11.b, z2.b[0]\n"
- "sdot z31.s, z11.b, z3.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
- "sdot z24.s, z12.b, z2.b[1]\n"
- "sdot z28.s, z12.b, z3.b[1]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "sdot z21.s, z13.b, z1.b[1]\n"
- "sdot z25.s, z13.b, z2.b[1]\n"
- "sdot z29.s, z13.b, z3.b[1]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "sdot z26.s, z14.b, z2.b[1]\n"
- "sdot z30.s, z14.b, z3.b[1]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
- "sdot z27.s, z15.b, z2.b[1]\n"
- "sdot z31.s, z15.b, z3.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z24.s, z8.b, z2.b[2]\n"
- "sdot z28.s, z8.b, z3.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "sdot z21.s, z9.b, z1.b[2]\n"
- "sdot z25.s, z9.b, z2.b[2]\n"
- "sdot z29.s, z9.b, z3.b[2]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "sdot z26.s, z10.b, z2.b[2]\n"
- "sdot z30.s, z10.b, z3.b[2]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
- "sdot z27.s, z11.b, z2.b[2]\n"
- "sdot z31.s, z11.b, z3.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "sdot z24.s, z12.b, z2.b[3]\n"
- "sdot z28.s, z12.b, z3.b[3]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z21.s, z13.b, z1.b[3]\n"
- "sdot z25.s, z13.b, z2.b[3]\n"
- "sdot z29.s, z13.b, z3.b[3]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
- "sdot z26.s, z14.b, z2.b[3]\n"
- "sdot z30.s, z14.b, z3.b[3]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "sdot z23.s, z15.b, z1.b[3]\n"
- "sdot z27.s, z15.b, z2.b[3]\n"
- "sdot z31.s, z15.b, z3.b[3]\n"
- "b 5f\n"
- "4:\n"
- "sdot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
- "sdot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr1]\n"
- "sdot z28.s, z8.b, z3.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "sdot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr2]\n"
- "sdot z21.s, z9.b, z1.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr3]\n"
- "sdot z25.s, z9.b, z2.b[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "sdot z29.s, z9.b, z3.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "sdot z18.s, z10.b, z0.b[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "sdot z22.s, z10.b, z1.b[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- "sdot z26.s, z10.b, z2.b[0]\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- "sdot z30.s, z10.b, z3.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z19.s, z11.b, z0.b[0]\n"
- "sdot z23.s, z11.b, z1.b[0]\n"
- "sdot z27.s, z11.b, z2.b[0]\n"
- "sdot z31.s, z11.b, z3.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z16.s, z12.b, z0.b[1]\n"
- "sdot z20.s, z12.b, z1.b[1]\n"
- "sdot z24.s, z12.b, z2.b[1]\n"
- "sdot z28.s, z12.b, z3.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "sdot z17.s, z13.b, z0.b[1]\n"
- "sdot z21.s, z13.b, z1.b[1]\n"
- "sdot z25.s, z13.b, z2.b[1]\n"
- "sdot z29.s, z13.b, z3.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "sdot z18.s, z14.b, z0.b[1]\n"
- "sdot z22.s, z14.b, z1.b[1]\n"
- "sdot z26.s, z14.b, z2.b[1]\n"
- "sdot z30.s, z14.b, z3.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z19.s, z15.b, z0.b[1]\n"
- "sdot z23.s, z15.b, z1.b[1]\n"
- "sdot z27.s, z15.b, z2.b[1]\n"
- "sdot z31.s, z15.b, z3.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "sdot z20.s, z8.b, z1.b[2]\n"
- "sdot z24.s, z8.b, z2.b[2]\n"
- "sdot z28.s, z8.b, z3.b[2]\n"
- "sdot z17.s, z9.b, z0.b[2]\n"
- "sdot z21.s, z9.b, z1.b[2]\n"
- "sdot z25.s, z9.b, z2.b[2]\n"
- "sdot z29.s, z9.b, z3.b[2]\n"
- "sdot z18.s, z10.b, z0.b[2]\n"
- "sdot z22.s, z10.b, z1.b[2]\n"
- "sdot z26.s, z10.b, z2.b[2]\n"
- "sdot z30.s, z10.b, z3.b[2]\n"
- "sdot z19.s, z11.b, z0.b[2]\n"
- "sdot z23.s, z11.b, z1.b[2]\n"
- "sdot z27.s, z11.b, z2.b[2]\n"
- "sdot z31.s, z11.b, z3.b[2]\n"
- "sdot z16.s, z12.b, z0.b[3]\n"
- "sdot z20.s, z12.b, z1.b[3]\n"
- "sdot z24.s, z12.b, z2.b[3]\n"
- "sdot z28.s, z12.b, z3.b[3]\n"
- "sdot z17.s, z13.b, z0.b[3]\n"
- "sdot z21.s, z13.b, z1.b[3]\n"
- "sdot z25.s, z13.b, z2.b[3]\n"
- "sdot z29.s, z13.b, z3.b[3]\n"
- "sdot z18.s, z14.b, z0.b[3]\n"
- "sdot z22.s, z14.b, z1.b[3]\n"
- "sdot z26.s, z14.b, z2.b[3]\n"
- "sdot z30.s, z14.b, z3.b[3]\n"
- "sdot z19.s, z15.b, z0.b[3]\n"
- "sdot z23.s, z15.b, z1.b[3]\n"
- "sdot z27.s, z15.b, z2.b[3]\n"
- "sdot z31.s, z15.b, z3.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "sdot z20.s, z8.b, z5.b[0]\n"
- "sdot z24.s, z8.b, z6.b[0]\n"
- "sdot z28.s, z8.b, z7.b[0]\n"
- "sdot z17.s, z9.b, z4.b[0]\n"
- "sdot z21.s, z9.b, z5.b[0]\n"
- "sdot z25.s, z9.b, z6.b[0]\n"
- "sdot z29.s, z9.b, z7.b[0]\n"
- "sdot z18.s, z10.b, z4.b[0]\n"
- "sdot z22.s, z10.b, z5.b[0]\n"
- "sdot z26.s, z10.b, z6.b[0]\n"
- "sdot z30.s, z10.b, z7.b[0]\n"
- "sdot z19.s, z11.b, z4.b[0]\n"
- "sdot z23.s, z11.b, z5.b[0]\n"
- "sdot z27.s, z11.b, z6.b[0]\n"
- "sdot z31.s, z11.b, z7.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "sdot z20.s, z12.b, z5.b[1]\n"
- "sdot z24.s, z12.b, z6.b[1]\n"
- "sdot z28.s, z12.b, z7.b[1]\n"
- "sdot z17.s, z13.b, z4.b[1]\n"
- "sdot z21.s, z13.b, z5.b[1]\n"
- "sdot z25.s, z13.b, z6.b[1]\n"
- "sdot z29.s, z13.b, z7.b[1]\n"
- "sdot z18.s, z14.b, z4.b[1]\n"
- "sdot z22.s, z14.b, z5.b[1]\n"
- "sdot z26.s, z14.b, z6.b[1]\n"
- "sdot z30.s, z14.b, z7.b[1]\n"
- "sdot z19.s, z15.b, z4.b[1]\n"
- "sdot z23.s, z15.b, z5.b[1]\n"
- "sdot z27.s, z15.b, z6.b[1]\n"
- "sdot z31.s, z15.b, z7.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "sdot z16.s, z8.b, z4.b[2]\n"
- "sdot z20.s, z8.b, z5.b[2]\n"
- "sdot z24.s, z8.b, z6.b[2]\n"
- "sdot z28.s, z8.b, z7.b[2]\n"
- "sdot z17.s, z9.b, z4.b[2]\n"
- "sdot z21.s, z9.b, z5.b[2]\n"
- "sdot z25.s, z9.b, z6.b[2]\n"
- "sdot z29.s, z9.b, z7.b[2]\n"
- "sdot z18.s, z10.b, z4.b[2]\n"
- "sdot z22.s, z10.b, z5.b[2]\n"
- "sdot z26.s, z10.b, z6.b[2]\n"
- "sdot z30.s, z10.b, z7.b[2]\n"
- "sdot z19.s, z11.b, z4.b[2]\n"
- "sdot z23.s, z11.b, z5.b[2]\n"
- "sdot z27.s, z11.b, z6.b[2]\n"
- "sdot z31.s, z11.b, z7.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "sdot z16.s, z12.b, z4.b[3]\n"
- "sdot z20.s, z12.b, z5.b[3]\n"
- "sdot z24.s, z12.b, z6.b[3]\n"
- "sdot z28.s, z12.b, z7.b[3]\n"
- "sdot z17.s, z13.b, z4.b[3]\n"
- "sdot z21.s, z13.b, z5.b[3]\n"
- "sdot z25.s, z13.b, z6.b[3]\n"
- "sdot z29.s, z13.b, z7.b[3]\n"
- "sdot z18.s, z14.b, z4.b[3]\n"
- "sdot z22.s, z14.b, z5.b[3]\n"
- "sdot z26.s, z14.b, z6.b[3]\n"
- "sdot z30.s, z14.b, z7.b[3]\n"
- "sdot z19.s, z15.b, z4.b[3]\n"
- "sdot z23.s, z15.b, z5.b[3]\n"
- "sdot z27.s, z15.b, z6.b[3]\n"
- "sdot z31.s, z15.b, z7.b[3]\n"
- "5:\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "st1w z20.s, p0, [c_ptr1]\n"
- "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
- "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
- "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
- "st1w z24.s, p0, [c_ptr2]\n"
- "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
- "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
- "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
- "st1w z28.s, p0, [c_ptr3]\n"
- "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
- "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
- "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- }
-
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
new file mode 100644
index 0000000000..1aebedb861
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __ARM_FEATURE_SVE
+
+#include "../std_transforms_sve.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<int8_t>, \
+ size_t, size_t, \
+ const int8_t *, \
+ IndirectOutputArg<int32_t>, \
+ const int32_t *, Activation, bool
+
+namespace arm_gemm
+{
+
+// Actual kernel implementation (defined in sve_hybrid_s8s32_dot_6x4VL/generic.cpp); ARGLIST above supplies the parameter types.
+void sve_hybrid_s8s32_dot_6x4VL( ARGLIST );
+
+class cls_sve_hybrid_s8s32_dot_6x4VL // Descriptor for the sve_hybrid_s8s32_dot_6x4VL kernel: exposes its element types, blocking parameters, transforms and kernel function pointer.
+{
+public:
+ typedef int8_t operand_type; // input element type (matches the int8_t input arguments in ARGLIST)
+ typedef int32_t result_type; // output element type (matches IndirectOutputArg<int32_t> in ARGLIST)
+
+ typedef void (*kern_type)( ARGLIST ); // pointer to a kernel function with the ARGLIST parameter list
+
+ /* Kernel blocking parameters */
+ static constexpr unsigned int out_height()
+ {
+ return 6; // the "6" in the kernel name; presumably the number of output rows per kernel call - TODO confirm
+ }
+
+ static unsigned int out_width() // not constexpr, unlike the other accessors, because it calls get_vector_length()
+ {
+ return get_vector_length<int32_t>() * 4; // four times get_vector_length<int32_t>() (the "4VL" in the name); presumably int32_t lanes per SVE vector - TODO confirm
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4; // NOTE(review): presumably the K-dimension unroll, i.e. 4 int8 values per 32-bit dot-product lane - confirm
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true; // NOTE(review): presumably signals that the kernel honours the trailing bool (accumulate) argument - confirm
+ }
+
+ StdTransformsSVE<operand_type, result_type, 6, 4, 4> transforms = {}; // NOTE(review): 6, 4, 4 appear to mirror out_height(), the x4 in out_width() and k_unroll() - confirm in std_transforms_sve.hpp
+
+ // Default to the generic kernel; the constructor below never replaces it
+ kern_type kernel=sve_hybrid_s8s32_dot_6x4VL;
+
+ cls_sve_hybrid_s8s32_dot_6x4VL(const CPUInfo *) // CPUInfo pointer is accepted but unused (unnamed parameter, empty body)
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
new file mode 100644
index 0000000000..cae9bf329f
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_s8s32_dot_6x4VL/generic.cpp
@@ -0,0 +1,1904 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_s8s32_dot_6x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, size_t N, const int8_t *B_ptr, IndirectOutputArg<int32_t> output_arg,
+ const int32_t *, Activation, bool accumulate
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const int8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ __asm__ __volatile__(
+ "ptrue p5.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 61f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 49f\n"
+ "beq 37f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 25f\n"
+ "beq 13f\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x15\n"
+ "tbz %x[flags], #0, 4f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "b 5f\n"
+ "4:" // Height 1: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "5:" // Height 1: setup done
+ "mov x12, #0x0\n"
+ "6:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 7f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "cbnz x12, 8f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "b 8f\n"
+ "7:" // Height 1: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "8:" // Height 1: input setup done
+ "cmp x11, #0x10\n"
+ "ble 10f\n"
+ "9:" // Height 1: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "cmp x11, #0x10\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "bgt 9b\n"
+ "10:" // Height 1: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "ble 11f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "ble 11f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "ble 11f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "11:" // Height 1: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 6b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "12:" // Height 1: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 3b\n"
+ "b 74f\n"
+ "13:" // Height 2
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 14f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "b 15f\n"
+ "14:" // Height 2: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "15:" // Height 2: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x15\n"
+ "tbz %x[flags], #0, 16f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "b 17f\n"
+ "16:" // Height 2: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "17:" // Height 2: setup done
+ "mov x12, #0x0\n"
+ "18:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 19f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x12, 20f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "b 20f\n"
+ "19:" // Height 2: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "20:" // Height 2: input setup done
+ "cmp x11, #0x10\n"
+ "ble 22f\n"
+ "21:" // Height 2: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "cmp x11, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "bgt 21b\n"
+ "22:" // Height 2: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "ble 23f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "ble 23f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "ble 23f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "23:" // Height 2: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 18b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "24:" // Height 2: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 15b\n"
+ "b 74f\n"
+ "25:" // Height 3
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 26f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "b 27f\n"
+ "26:" // Height 3: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "27:" // Height 3: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x15\n"
+ "tbz %x[flags], #0, 28f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "b 29f\n"
+ "28:" // Height 3: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "29:" // Height 3: setup done
+ "mov x12, #0x0\n"
+ "30:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 31f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x12, 32f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "b 32f\n"
+ "31:" // Height 3: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "32:" // Height 3: input setup done
+ "cmp x11, #0x10\n"
+ "ble 34f\n"
+ "33:" // Height 3: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "cmp x11, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "bgt 33b\n"
+ "34:" // Height 3: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "ble 35f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "ble 35f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "ble 35f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "35:" // Height 3: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 30b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "36:" // Height 3: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 27b\n"
+ "b 74f\n"
+ "37:" // Height 4
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 38f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 39f\n"
+ "38:" // Height 4: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "39:" // Height 4: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x15\n"
+ "tbz %x[flags], #0, 40f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25]\n"
+ "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "b 41f\n"
+ "40:" // Height 4: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "41:" // Height 4: setup done
+ "mov x12, #0x0\n"
+ "42:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 43f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x12, 44f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "b 44f\n"
+ "43:" // Height 4: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "44:" // Height 4: input setup done
+ "cmp x11, #0x10\n"
+ "ble 46f\n"
+ "45:" // Height 4: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x10\n"
+ "sdot z20.s, z6.b, z3.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sdot z21.s, z7.b, z3.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z22.s, z6.b, z3.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z23.s, z7.b, z3.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "sdot z20.s, z6.b, z3.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "sdot z21.s, z7.b, z3.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z22.s, z6.b, z3.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z23.s, z7.b, z3.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "sdot z20.s, z6.b, z3.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "sdot z21.s, z7.b, z3.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z22.s, z6.b, z3.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z23.s, z7.b, z3.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "sdot z20.s, z6.b, z3.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "sdot z21.s, z7.b, z3.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z22.s, z6.b, z3.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z23.s, z7.b, z3.b[3]\n"
+ "bgt 45b\n"
+ "46:" // Height 4: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "sdot z20.s, z6.b, z3.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z21.s, z7.b, z3.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z22.s, z6.b, z3.b[0]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z23.s, z7.b, z3.b[0]\n"
+ "ble 47f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "sdot z20.s, z6.b, z3.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "sdot z21.s, z7.b, z3.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z22.s, z6.b, z3.b[1]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z23.s, z7.b, z3.b[1]\n"
+ "ble 47f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "sdot z20.s, z6.b, z3.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "sdot z21.s, z7.b, z3.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z22.s, z6.b, z3.b[2]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z23.s, z7.b, z3.b[2]\n"
+ "ble 47f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "sdot z20.s, z6.b, z3.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "sdot z21.s, z7.b, z3.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z22.s, z6.b, z3.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z23.s, z7.b, z3.b[3]\n"
+ "47:" // Height 4: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 42b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1w { z20.s }, p4, [x25]\n"
+ "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "48:" // Height 4: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 39b\n"
+ "b 74f\n"
+ "49:" // Height 5
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 50f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 51f\n"
+ "50:" // Height 5: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "51:" // Height 5: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x15\n"
+ "tbz %x[flags], #0, 52f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25]\n"
+ "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x23]\n"
+ "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "b 53f\n"
+ "52:" // Height 5: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "53:" // Height 5: setup done
+ "mov x12, #0x0\n"
+ "54:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 55f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x12, 56f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "b 56f\n"
+ "55:" // Height 5: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "56:" // Height 5: input setup done
+ "cmp x11, #0x10\n"
+ "ble 58f\n"
+ "57:" // Height 5: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z20.s, z6.b, z3.b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x10\n"
+ "sdot z24.s, z6.b, z4.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sdot z21.s, z7.b, z3.b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sdot z25.s, z7.b, z4.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z22.s, z6.b, z3.b[0]\n"
+ "sdot z26.s, z6.b, z4.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z23.s, z7.b, z3.b[0]\n"
+ "sdot z27.s, z7.b, z4.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "sdot z20.s, z6.b, z3.b[1]\n"
+ "sdot z24.s, z6.b, z4.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "sdot z21.s, z7.b, z3.b[1]\n"
+ "sdot z25.s, z7.b, z4.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z22.s, z6.b, z3.b[1]\n"
+ "sdot z26.s, z6.b, z4.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z23.s, z7.b, z3.b[1]\n"
+ "sdot z27.s, z7.b, z4.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "sdot z20.s, z6.b, z3.b[2]\n"
+ "sdot z24.s, z6.b, z4.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "sdot z21.s, z7.b, z3.b[2]\n"
+ "sdot z25.s, z7.b, z4.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z22.s, z6.b, z3.b[2]\n"
+ "sdot z26.s, z6.b, z4.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z23.s, z7.b, z3.b[2]\n"
+ "sdot z27.s, z7.b, z4.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "sdot z20.s, z6.b, z3.b[3]\n"
+ "sdot z24.s, z6.b, z4.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "sdot z21.s, z7.b, z3.b[3]\n"
+ "sdot z25.s, z7.b, z4.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z22.s, z6.b, z3.b[3]\n"
+ "sdot z26.s, z6.b, z4.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z23.s, z7.b, z3.b[3]\n"
+ "sdot z27.s, z7.b, z4.b[3]\n"
+ "bgt 57b\n"
+ "58:" // Height 5: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "sdot z20.s, z6.b, z3.b[0]\n"
+ "sdot z24.s, z6.b, z4.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z21.s, z7.b, z3.b[0]\n"
+ "sdot z25.s, z7.b, z4.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z22.s, z6.b, z3.b[0]\n"
+ "sdot z26.s, z6.b, z4.b[0]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z23.s, z7.b, z3.b[0]\n"
+ "sdot z27.s, z7.b, z4.b[0]\n"
+ "ble 59f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "sdot z20.s, z6.b, z3.b[1]\n"
+ "sdot z24.s, z6.b, z4.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "sdot z21.s, z7.b, z3.b[1]\n"
+ "sdot z25.s, z7.b, z4.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z22.s, z6.b, z3.b[1]\n"
+ "sdot z26.s, z6.b, z4.b[1]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z23.s, z7.b, z3.b[1]\n"
+ "sdot z27.s, z7.b, z4.b[1]\n"
+ "ble 59f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "sdot z20.s, z6.b, z3.b[2]\n"
+ "sdot z24.s, z6.b, z4.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "sdot z21.s, z7.b, z3.b[2]\n"
+ "sdot z25.s, z7.b, z4.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z22.s, z6.b, z3.b[2]\n"
+ "sdot z26.s, z6.b, z4.b[2]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z23.s, z7.b, z3.b[2]\n"
+ "sdot z27.s, z7.b, z4.b[2]\n"
+ "ble 59f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "sdot z20.s, z6.b, z3.b[3]\n"
+ "sdot z24.s, z6.b, z4.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "sdot z21.s, z7.b, z3.b[3]\n"
+ "sdot z25.s, z7.b, z4.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z22.s, z6.b, z3.b[3]\n"
+ "sdot z26.s, z6.b, z4.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z23.s, z7.b, z3.b[3]\n"
+ "sdot z27.s, z7.b, z4.b[3]\n"
+ "59:" // Height 5: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 54b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1w { z20.s }, p4, [x25]\n"
+ "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "st1w { z24.s }, p4, [x23]\n"
+ "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "60:" // Height 5: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 51b\n"
+ "b 74f\n"
+ "61:" // Height 6
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 62f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "ldr x21, [%x[output_ptr], #0x28]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 63f\n"
+ "62:" // Height 6: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "add x21, x23, x19, LSL #2\n"
+ "add %x[output_ptr], x21, x19, LSL #2\n"
+ "63:" // Height 6: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x15\n"
+ "tbz %x[flags], #0, 64f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25]\n"
+ "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x23]\n"
+ "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x21]\n"
+ "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "b 65f\n"
+ "64:" // Height 6: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "65:" // Height 6: setup done
+ "mov x12, #0x0\n"
+ "66:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 67f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x12, 68f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "add x20, x20, x19\n"
+ "b 68f\n"
+ "67:" // Height 6: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "add x20, x22, x19\n"
+ "68:" // Height 6: input setup done
+ "cmp x11, #0x10\n"
+ "ble 70f\n"
+ "69:" // Height 6: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "ld1rqb { z5.b }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z20.s, z6.b, z3.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x20, x20, #0x10\n"
+ "sdot z24.s, z6.b, z4.b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x10\n"
+ "sdot z28.s, z6.b, z5.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sdot z21.s, z7.b, z3.b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "sdot z25.s, z7.b, z4.b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "sdot z29.s, z7.b, z5.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z22.s, z6.b, z3.b[0]\n"
+ "sdot z26.s, z6.b, z4.b[0]\n"
+ "sdot z30.s, z6.b, z5.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z23.s, z7.b, z3.b[0]\n"
+ "sdot z27.s, z7.b, z4.b[0]\n"
+ "sdot z31.s, z7.b, z5.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "sdot z20.s, z6.b, z3.b[1]\n"
+ "sdot z24.s, z6.b, z4.b[1]\n"
+ "sdot z28.s, z6.b, z5.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "sdot z21.s, z7.b, z3.b[1]\n"
+ "sdot z25.s, z7.b, z4.b[1]\n"
+ "sdot z29.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z22.s, z6.b, z3.b[1]\n"
+ "sdot z26.s, z6.b, z4.b[1]\n"
+ "sdot z30.s, z6.b, z5.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z23.s, z7.b, z3.b[1]\n"
+ "sdot z27.s, z7.b, z4.b[1]\n"
+ "sdot z31.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "sdot z20.s, z6.b, z3.b[2]\n"
+ "sdot z24.s, z6.b, z4.b[2]\n"
+ "sdot z28.s, z6.b, z5.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "sdot z21.s, z7.b, z3.b[2]\n"
+ "sdot z25.s, z7.b, z4.b[2]\n"
+ "sdot z29.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z22.s, z6.b, z3.b[2]\n"
+ "sdot z26.s, z6.b, z4.b[2]\n"
+ "sdot z30.s, z6.b, z5.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z23.s, z7.b, z3.b[2]\n"
+ "sdot z27.s, z7.b, z4.b[2]\n"
+ "sdot z31.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "sdot z20.s, z6.b, z3.b[3]\n"
+ "sdot z24.s, z6.b, z4.b[3]\n"
+ "sdot z28.s, z6.b, z5.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "sdot z21.s, z7.b, z3.b[3]\n"
+ "sdot z25.s, z7.b, z4.b[3]\n"
+ "sdot z29.s, z7.b, z5.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z22.s, z6.b, z3.b[3]\n"
+ "sdot z26.s, z6.b, z4.b[3]\n"
+ "sdot z30.s, z6.b, z5.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z23.s, z7.b, z3.b[3]\n"
+ "sdot z27.s, z7.b, z4.b[3]\n"
+ "sdot z31.s, z7.b, z5.b[3]\n"
+ "bgt 69b\n"
+ "70:" // Height 6: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "sdot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "sdot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "sdot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "sdot z16.s, z6.b, z2.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "sdot z13.s, z7.b, z1.b[0]\n"
+ "ld1rqb { z5.b }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "sdot z20.s, z6.b, z3.b[0]\n"
+ "add x20, x20, #0x10\n"
+ "sdot z17.s, z7.b, z2.b[0]\n"
+ "sdot z24.s, z6.b, z4.b[0]\n"
+ "sdot z28.s, z6.b, z5.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z21.s, z7.b, z3.b[0]\n"
+ "sdot z25.s, z7.b, z4.b[0]\n"
+ "sdot z29.s, z7.b, z5.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[0]\n"
+ "sdot z14.s, z6.b, z1.b[0]\n"
+ "sdot z18.s, z6.b, z2.b[0]\n"
+ "sdot z22.s, z6.b, z3.b[0]\n"
+ "sdot z26.s, z6.b, z4.b[0]\n"
+ "sdot z30.s, z6.b, z5.b[0]\n"
+ "sdot z11.s, z7.b, z0.b[0]\n"
+ "sdot z15.s, z7.b, z1.b[0]\n"
+ "sdot z19.s, z7.b, z2.b[0]\n"
+ "sdot z23.s, z7.b, z3.b[0]\n"
+ "sdot z27.s, z7.b, z4.b[0]\n"
+ "sdot z31.s, z7.b, z5.b[0]\n"
+ "ble 71f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[1]\n"
+ "sdot z16.s, z6.b, z2.b[1]\n"
+ "sdot z20.s, z6.b, z3.b[1]\n"
+ "sdot z24.s, z6.b, z4.b[1]\n"
+ "sdot z28.s, z6.b, z5.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[1]\n"
+ "sdot z13.s, z7.b, z1.b[1]\n"
+ "sdot z17.s, z7.b, z2.b[1]\n"
+ "sdot z21.s, z7.b, z3.b[1]\n"
+ "sdot z25.s, z7.b, z4.b[1]\n"
+ "sdot z29.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[1]\n"
+ "sdot z14.s, z6.b, z1.b[1]\n"
+ "sdot z18.s, z6.b, z2.b[1]\n"
+ "sdot z22.s, z6.b, z3.b[1]\n"
+ "sdot z26.s, z6.b, z4.b[1]\n"
+ "sdot z30.s, z6.b, z5.b[1]\n"
+ "sdot z11.s, z7.b, z0.b[1]\n"
+ "sdot z15.s, z7.b, z1.b[1]\n"
+ "sdot z19.s, z7.b, z2.b[1]\n"
+ "sdot z23.s, z7.b, z3.b[1]\n"
+ "sdot z27.s, z7.b, z4.b[1]\n"
+ "sdot z31.s, z7.b, z5.b[1]\n"
+ "ble 71f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "sdot z12.s, z6.b, z1.b[2]\n"
+ "sdot z16.s, z6.b, z2.b[2]\n"
+ "sdot z20.s, z6.b, z3.b[2]\n"
+ "sdot z24.s, z6.b, z4.b[2]\n"
+ "sdot z28.s, z6.b, z5.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[2]\n"
+ "sdot z13.s, z7.b, z1.b[2]\n"
+ "sdot z17.s, z7.b, z2.b[2]\n"
+ "sdot z21.s, z7.b, z3.b[2]\n"
+ "sdot z25.s, z7.b, z4.b[2]\n"
+ "sdot z29.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[2]\n"
+ "sdot z14.s, z6.b, z1.b[2]\n"
+ "sdot z18.s, z6.b, z2.b[2]\n"
+ "sdot z22.s, z6.b, z3.b[2]\n"
+ "sdot z26.s, z6.b, z4.b[2]\n"
+ "sdot z30.s, z6.b, z5.b[2]\n"
+ "sdot z11.s, z7.b, z0.b[2]\n"
+ "sdot z15.s, z7.b, z1.b[2]\n"
+ "sdot z19.s, z7.b, z2.b[2]\n"
+ "sdot z23.s, z7.b, z3.b[2]\n"
+ "sdot z27.s, z7.b, z4.b[2]\n"
+ "sdot z31.s, z7.b, z5.b[2]\n"
+ "ble 71f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "sdot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sdot z12.s, z6.b, z1.b[3]\n"
+ "sdot z16.s, z6.b, z2.b[3]\n"
+ "sdot z20.s, z6.b, z3.b[3]\n"
+ "sdot z24.s, z6.b, z4.b[3]\n"
+ "sdot z28.s, z6.b, z5.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "sdot z9.s, z7.b, z0.b[3]\n"
+ "sdot z13.s, z7.b, z1.b[3]\n"
+ "sdot z17.s, z7.b, z2.b[3]\n"
+ "sdot z21.s, z7.b, z3.b[3]\n"
+ "sdot z25.s, z7.b, z4.b[3]\n"
+ "sdot z29.s, z7.b, z5.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "sdot z10.s, z6.b, z0.b[3]\n"
+ "sdot z14.s, z6.b, z1.b[3]\n"
+ "sdot z18.s, z6.b, z2.b[3]\n"
+ "sdot z22.s, z6.b, z3.b[3]\n"
+ "sdot z26.s, z6.b, z4.b[3]\n"
+ "sdot z30.s, z6.b, z5.b[3]\n"
+ "sdot z11.s, z7.b, z0.b[3]\n"
+ "sdot z15.s, z7.b, z1.b[3]\n"
+ "sdot z19.s, z7.b, z2.b[3]\n"
+ "sdot z23.s, z7.b, z3.b[3]\n"
+ "sdot z27.s, z7.b, z4.b[3]\n"
+ "sdot z31.s, z7.b, z5.b[3]\n"
+ "71:" // Height 6: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 66b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1w { z20.s }, p4, [x25]\n"
+ "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "st1w { z24.s }, p4, [x23]\n"
+ "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "st1w { z28.s }, p4, [x21]\n"
+ "st1w { z29.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z30.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z31.s }, p1, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #4\n"
+ "72:" // Height 6: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 63b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 74f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 73f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "73:" // Update direct input
+ "mov x19, #0x6\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "74:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
index c325e522d7..964f7cc2c1 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL.hpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -10,37 +10,43 @@
* sell copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
*/
#pragma once
-
#ifdef __ARM_FEATURE_SVE
-#include <cstdint>
#include "../std_transforms_sve.hpp"
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<uint8_t>, \
+ size_t, size_t, \
+ const uint8_t *, \
+ IndirectOutputArg<uint8_t>, \
+ const Requantize32 *, const int32_t *, unsigned int
+
namespace arm_gemm
{
// Actual kernel implementations
-void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+void sve_hybrid_u8qa_dot_4x4VL( ARGLIST );
-class hybrid_u8u32_dot_4VLx4
+class cls_sve_hybrid_u8qa_dot_4x4VL
{
public:
typedef uint8_t operand_type;
- typedef uint32_t result_type;
+ typedef uint8_t result_type;
- typedef void (*kern_type)(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+ typedef void (*kern_type)( ARGLIST );
/* Kernel blocking parameters */
static constexpr unsigned int out_height()
@@ -60,30 +66,20 @@ public:
static constexpr bool supports_accumulate()
{
- return true;
- }
-
- static constexpr bool supports_bias()
- {
- return false;
- }
-
- static constexpr bool supports_activation()
- {
return false;
}
StdTransformsSVE<operand_type, result_type, 4, 4, 4> transforms = {};
// Default to the generic kernel
- kern_type kernel=sve_hybrid_u8u32_dot_4VLx4;
+ kern_type kernel=sve_hybrid_u8qa_dot_4x4VL;
- hybrid_u8u32_dot_4VLx4(const CPUInfo *)
+ cls_sve_hybrid_u8qa_dot_4x4VL(const CPUInfo *)
{
-
}
};
} // namespace arm_gemm
+#undef ARGLIST
#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
new file mode 100644
index 0000000000..0a6546b78a
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8qa_dot_4x4VL/generic.cpp
@@ -0,0 +1,1602 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+#include <cstdint>
+
+namespace arm_gemm {
+
+void sve_hybrid_u8qa_dot_4x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+ size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint8_t> output_arg,
+ const Requantize32 *qp, const int32_t *col_bias, unsigned int
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const uint8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ if (qp->c_offset > qp->minval) {
+ flags |= 0x20;
+ }
+ __asm__ __volatile__(
+ "ptrue p2.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x4\n"
+ "bge 46f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 31f\n"
+ "beq 16f\n"
+ "mov z11.s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov z12.s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "mov x10, %x[col_bias]\n"
+ "mov z13.s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.b, #0x1\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "add x9, x9, x19\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "mov z16.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "whilelt p1.b, x19, x12\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "4:" // Height 1: setup done
+ "mov x28, #0x0\n"
+ "5:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 6f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "cbnz x28, 7f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "b 7f\n"
+ "6:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "7:" // Height 1: input setup done
+ "cmp x27, #0x10\n"
+ "ble 10f\n"
+ "8:" // Height 1: Multiply loop: Main loop head
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "udot z16.s, z4.b, z0.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ "udot z17.s, z5.b, z0.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+ "udot z18.s, z6.b, z0.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+ "udot z19.s, z7.b, z0.b[0]\n"
+ "udot z16.s, z8.b, z0.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "udot z17.s, z9.b, z0.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+ "udot z19.s, z4.b, z0.b[1]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+ "udot z16.s, z5.b, z0.b[2]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+ "udot z17.s, z6.b, z0.b[2]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+ "udot z18.s, z7.b, z0.b[2]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+ "udot z19.s, z8.b, z0.b[2]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+ "udot z16.s, z9.b, z0.b[3]\n"
+ "udot z17.s, z10.b, z0.b[3]\n"
+ "udot z18.s, z4.b, z0.b[3]\n"
+ "udot z19.s, z5.b, z0.b[3]\n"
+ "tbnz %x[flags], #31, 9f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "9:" // Height 1: Multiply loop: unique 1: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x10\n"
+ "bgt 8b\n"
+ "10:" // Height 1: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "udot z16.s, z6.b, z0.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x26, x26, #0x10\n"
+ "udot z17.s, z7.b, z0.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "udot z18.s, z8.b, z0.b[0]\n"
+ "udot z19.s, z9.b, z0.b[0]\n"
+ "ble 11f\n"
+ "ld1b { z10.b }, p2/Z, [x11]\n"
+ "udot z16.s, z10.b, z0.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z17.s, z4.b, z0.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "udot z18.s, z5.b, z0.b[1]\n"
+ "addvl x11, x11, #4\n"
+ "udot z19.s, z6.b, z0.b[1]\n"
+ "ble 11f\n"
+ "ld1b { z7.b }, p2/Z, [x11]\n"
+ "udot z16.s, z7.b, z0.b[2]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z17.s, z8.b, z0.b[2]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "udot z18.s, z9.b, z0.b[2]\n"
+ "addvl x11, x11, #4\n"
+ "udot z19.s, z10.b, z0.b[2]\n"
+ "ble 11f\n"
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "udot z16.s, z4.b, z0.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "udot z17.s, z5.b, z0.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "udot z18.s, z6.b, z0.b[3]\n"
+ "udot z19.s, z7.b, z0.b[3]\n"
+ "11:" // Height 1: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 12f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "12:" // Height 1: Multiply loop: unique 2: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x28, x28, #0x1\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x28, x19\n"
+ "bne 5b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "tbnz %x[flags], #31, 13f\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1rw { z1.s }, p2/Z, [x19]\n"
+ "neg z1.s, p2/M, z1.s\n"
+ "mov x19, #0x4\n"
+ "whilelt p0.s, XZR, x19\n"
+ "uaddv d11, p0, z11.s\n"
+ "mov z11.s, z11.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z1.s\n"
+ "13:" // Height 1: skip row sum fixup
+ "add z16.s, z16.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x10]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z17.s, z17.s, z11.s\n"
+ "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z18.s, z18.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "add z19.s, z19.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "add z16.s, z16.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z17.s, z17.s, z1.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ "tbz %x[flags], #5, 14f\n"
+ "and z4.d, z16.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z17.d, z0.d\n"
+ "and z6.d, z18.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z19.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "14:" // Height 1: no shift correction
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x9]\n"
+ "addvl x9, x9, #1\n"
+ "15:" // Height 1: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x12, x12, x19\n"
+ "bgt 3b\n"
+ "b 62f\n"
+ "16:" // Height 2
+ "mov z11.s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x10, %x[col_bias]\n"
+ "mov z12.s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z13.s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.b, #0x1\n"
+ "tbz %x[flags], #2, 17f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "ldr x25, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "add x25, x25, x19\n"
+ "b 18f\n"
+ "17:" // Height 2: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "add x25, x9, x19\n"
+ "18:" // Height 2: Column loop
+ "mov z16.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "whilelt p1.b, x19, x12\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "19:" // Height 2: setup done
+ "mov x28, #0x0\n"
+ "20:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 21f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "cbnz x28, 22f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "b 22f\n"
+ "21:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x24, x26, x19\n"
+ "22:" // Height 2: input setup done
+ "cmp x27, #0x10\n"
+ "ble 25f\n"
+ "23:" // Height 2: Multiply loop: Main loop head
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "udot z16.s, z4.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "udot z17.s, z5.b, z0.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ "udot z20.s, z4.b, z1.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "udot z21.s, z5.b, z1.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+ "udot z18.s, z6.b, z0.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+ "udot z22.s, z6.b, z1.b[0]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+ "udot z19.s, z7.b, z0.b[0]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "udot z23.s, z7.b, z1.b[0]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+ "udot z20.s, z8.b, z1.b[1]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+ "udot z17.s, z9.b, z0.b[1]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+ "udot z21.s, z9.b, z1.b[1]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[1]\n"
+ "udot z22.s, z10.b, z1.b[1]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+ "udot z19.s, z4.b, z0.b[1]\n"
+ "udot z23.s, z4.b, z1.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+ "udot z16.s, z5.b, z0.b[2]\n"
+ "udot z20.s, z5.b, z1.b[2]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+ "udot z17.s, z6.b, z0.b[2]\n"
+ "udot z21.s, z6.b, z1.b[2]\n"
+ "udot z18.s, z7.b, z0.b[2]\n"
+ "udot z22.s, z7.b, z1.b[2]\n"
+ "udot z19.s, z8.b, z0.b[2]\n"
+ "udot z23.s, z8.b, z1.b[2]\n"
+ "udot z16.s, z9.b, z0.b[3]\n"
+ "udot z20.s, z9.b, z1.b[3]\n"
+ "udot z17.s, z10.b, z0.b[3]\n"
+ "udot z21.s, z10.b, z1.b[3]\n"
+ "udot z18.s, z4.b, z0.b[3]\n"
+ "udot z22.s, z4.b, z1.b[3]\n"
+ "udot z19.s, z5.b, z0.b[3]\n"
+ "udot z23.s, z5.b, z1.b[3]\n"
+ "tbnz %x[flags], #31, 24f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z12.s, z1.b, z15.b\n"
+ "24:" // Height 2: Multiply loop: unique 3: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x27, #0x10\n"
+ "bgt 23b\n"
+ "25:" // Height 2: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "udot z16.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "udot z17.s, z7.b, z0.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x24, x24, #0x10\n"
+ "udot z20.s, z6.b, z1.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "udot z21.s, z7.b, z1.b[0]\n"
+ "udot z18.s, z8.b, z0.b[0]\n"
+ "udot z22.s, z8.b, z1.b[0]\n"
+ "udot z19.s, z9.b, z0.b[0]\n"
+ "udot z23.s, z9.b, z1.b[0]\n"
+ "ble 26f\n"
+ "ld1b { z10.b }, p2/Z, [x11]\n"
+ "udot z16.s, z10.b, z0.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z20.s, z10.b, z1.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "udot z17.s, z4.b, z0.b[1]\n"
+ "addvl x11, x11, #4\n"
+ "udot z21.s, z4.b, z1.b[1]\n"
+ "udot z18.s, z5.b, z0.b[1]\n"
+ "udot z22.s, z5.b, z1.b[1]\n"
+ "udot z19.s, z6.b, z0.b[1]\n"
+ "udot z23.s, z6.b, z1.b[1]\n"
+ "ble 26f\n"
+ "ld1b { z7.b }, p2/Z, [x11]\n"
+ "udot z16.s, z7.b, z0.b[2]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z20.s, z7.b, z1.b[2]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "udot z17.s, z8.b, z0.b[2]\n"
+ "addvl x11, x11, #4\n"
+ "udot z21.s, z8.b, z1.b[2]\n"
+ "udot z18.s, z9.b, z0.b[2]\n"
+ "udot z22.s, z9.b, z1.b[2]\n"
+ "udot z19.s, z10.b, z0.b[2]\n"
+ "udot z23.s, z10.b, z1.b[2]\n"
+ "ble 26f\n"
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "udot z16.s, z4.b, z0.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "udot z20.s, z4.b, z1.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "udot z17.s, z5.b, z0.b[3]\n"
+ "addvl x11, x11, #4\n"
+ "udot z21.s, z5.b, z1.b[3]\n"
+ "udot z18.s, z6.b, z0.b[3]\n"
+ "udot z22.s, z6.b, z1.b[3]\n"
+ "udot z19.s, z7.b, z0.b[3]\n"
+ "udot z23.s, z7.b, z1.b[3]\n"
+ "26:" // Height 2: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 27f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z12.s, z1.b, z15.b\n"
+ "27:" // Height 2: Multiply loop: unique 4: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x28, x28, #0x1\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x28, x19\n"
+ "bne 20b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "tbnz %x[flags], #31, 28f\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1rw { z2.s }, p2/Z, [x19]\n"
+ "neg z2.s, p2/M, z2.s\n"
+ "mov x20, #0x4\n"
+ "mov x19, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
+ "uaddv d11, p0, z11.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "uaddv d12, p0, z12.s\n"
+ "mov z11.s, z11.s[0]\n"
+ "mov z12.s, z12.s[0]\n"
+ "mul z11.s, p2/M, z11.s, z2.s\n"
+ "mul z12.s, p2/M, z12.s, z2.s\n"
+ "28:" // Height 2: skip row sum fixup
+ "add z16.s, z16.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x10]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z17.s, z17.s, z11.s\n"
+ "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z18.s, z18.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "add z19.s, z19.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "add z20.s, z20.s, z12.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add z21.s, z21.s, z12.s\n"
+ "add z22.s, z22.s, z12.s\n"
+ "add z23.s, z23.s, z12.s\n"
+ "add z16.s, z16.s, z0.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "add z20.s, z20.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z21.s, z21.s, z1.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z23.s, z23.s, z3.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
+ ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ "tbz %x[flags], #5, 29f\n"
+ "and z4.d, z16.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z17.d, z0.d\n"
+ "and z6.d, z18.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z19.d, z0.d\n"
+ "and z8.d, z20.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "and z9.d, z21.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "and z10.d, z22.d, z0.d\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "and z4.d, z23.d, z0.d\n"
+ "asr z9.s, z9.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "sqadd z20.s, z20.s, z8.s\n"
+ "sqadd z21.s, z21.s, z9.s\n"
+ "sqadd z22.s, z22.s, z10.s\n"
+ "sqadd z23.s, z23.s, z4.s\n"
+ "29:" // Height 2: no shift correction
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x9]\n"
+ "add z21.s, z21.s, z4.s\n"
+ "addvl x9, x9, #1\n"
+ ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "uzp1 z21.h, z22.h, z23.h\n"
+ "uzp1 z20.b, z20.b, z21.b\n"
+ "st1b { z20.b }, p1, [x25]\n"
+ "addvl x25, x25, #1\n"
+ "30:" // Height 2: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x12, x12, x19\n"
+ "bgt 18b\n"
+ "b 62f\n"
+ "31:" // Height 3
+ "mov z11.s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x10, %x[col_bias]\n"
+ "mov z12.s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z13.s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.b, #0x1\n"
+ "tbz %x[flags], #2, 32f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "ldr x25, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "ldr x23, [%x[output_ptr], #0x10]\n"
+ "add x25, x25, x19\n"
+ "add x23, x23, x19\n"
+ "b 33f\n"
+ "32:" // Height 3: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "add x25, x9, x19\n"
+ "add x23, x25, x19\n"
+ "33:" // Height 3: Column loop
+ "mov z16.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "whilelt p1.b, x19, x12\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "34:" // Height 3: setup done
+ "mov x28, #0x0\n"
+ "35:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 36f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "cbnz x28, 37f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "b 37f\n"
+ "36:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "37:" // Height 3: input setup done
+ "cmp x27, #0x10\n"
+ "ble 40f\n"
+ "38:" // Height 3: Multiply loop: Main loop head
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "udot z16.s, z4.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "udot z17.s, z5.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "udot z20.s, z4.b, z1.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x22, x22, #0x10\n"
+ "udot z24.s, z4.b, z2.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "udot z21.s, z5.b, z1.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+ "udot z25.s, z5.b, z2.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+ "udot z18.s, z6.b, z0.b[0]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+ "udot z22.s, z6.b, z1.b[0]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "udot z26.s, z6.b, z2.b[0]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+ "udot z19.s, z7.b, z0.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+ "udot z23.s, z7.b, z1.b[0]\n"
+ "udot z27.s, z7.b, z2.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[1]\n"
+ "udot z20.s, z8.b, z1.b[1]\n"
+ "udot z24.s, z8.b, z2.b[1]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+ "udot z17.s, z9.b, z0.b[1]\n"
+ "udot z21.s, z9.b, z1.b[1]\n"
+ "udot z25.s, z9.b, z2.b[1]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[1]\n"
+ "udot z22.s, z10.b, z1.b[1]\n"
+ "udot z26.s, z10.b, z2.b[1]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+ "udot z19.s, z4.b, z0.b[1]\n"
+ "udot z23.s, z4.b, z1.b[1]\n"
+ "udot z27.s, z4.b, z2.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+ "udot z16.s, z5.b, z0.b[2]\n"
+ "udot z20.s, z5.b, z1.b[2]\n"
+ "udot z24.s, z5.b, z2.b[2]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+ "udot z17.s, z6.b, z0.b[2]\n"
+ "udot z21.s, z6.b, z1.b[2]\n"
+ "udot z25.s, z6.b, z2.b[2]\n"
+ "udot z18.s, z7.b, z0.b[2]\n"
+ "udot z22.s, z7.b, z1.b[2]\n"
+ "udot z26.s, z7.b, z2.b[2]\n"
+ "udot z19.s, z8.b, z0.b[2]\n"
+ "udot z23.s, z8.b, z1.b[2]\n"
+ "udot z27.s, z8.b, z2.b[2]\n"
+ "udot z16.s, z9.b, z0.b[3]\n"
+ "udot z20.s, z9.b, z1.b[3]\n"
+ "udot z24.s, z9.b, z2.b[3]\n"
+ "udot z17.s, z10.b, z0.b[3]\n"
+ "udot z21.s, z10.b, z1.b[3]\n"
+ "udot z25.s, z10.b, z2.b[3]\n"
+ "udot z18.s, z4.b, z0.b[3]\n"
+ "udot z22.s, z4.b, z1.b[3]\n"
+ "udot z26.s, z4.b, z2.b[3]\n"
+ "udot z19.s, z5.b, z0.b[3]\n"
+ "udot z23.s, z5.b, z1.b[3]\n"
+ "udot z27.s, z5.b, z2.b[3]\n"
+ "tbnz %x[flags], #31, 39f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z12.s, z1.b, z15.b\n"
+ "udot z13.s, z2.b, z15.b\n"
+ "39:" // Height 3: Multiply loop: unique 5: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x27, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "bgt 38b\n"
+ "40:" // Height 3: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "udot z16.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "udot z17.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "udot z20.s, z6.b, z1.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x22, x22, #0x10\n"
+ "udot z24.s, z6.b, z2.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "udot z21.s, z7.b, z1.b[0]\n"
+ "udot z25.s, z7.b, z2.b[0]\n"
+ "udot z18.s, z8.b, z0.b[0]\n"
+ "udot z22.s, z8.b, z1.b[0]\n"
+ "udot z26.s, z8.b, z2.b[0]\n"
+ "udot z19.s, z9.b, z0.b[0]\n"
+ "udot z23.s, z9.b, z1.b[0]\n"
+ "udot z27.s, z9.b, z2.b[0]\n"
+ "ble 41f\n"
+ "ld1b { z10.b }, p2/Z, [x11]\n"
+ "udot z16.s, z10.b, z0.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z20.s, z10.b, z1.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "udot z24.s, z10.b, z2.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "udot z17.s, z4.b, z0.b[1]\n"
+ "udot z21.s, z4.b, z1.b[1]\n"
+ "udot z25.s, z4.b, z2.b[1]\n"
+ "udot z18.s, z5.b, z0.b[1]\n"
+ "udot z22.s, z5.b, z1.b[1]\n"
+ "udot z26.s, z5.b, z2.b[1]\n"
+ "udot z19.s, z6.b, z0.b[1]\n"
+ "udot z23.s, z6.b, z1.b[1]\n"
+ "udot z27.s, z6.b, z2.b[1]\n"
+ "ble 41f\n"
+ "ld1b { z7.b }, p2/Z, [x11]\n"
+ "udot z16.s, z7.b, z0.b[2]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z20.s, z7.b, z1.b[2]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "udot z24.s, z7.b, z2.b[2]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "udot z17.s, z8.b, z0.b[2]\n"
+ "udot z21.s, z8.b, z1.b[2]\n"
+ "udot z25.s, z8.b, z2.b[2]\n"
+ "udot z18.s, z9.b, z0.b[2]\n"
+ "udot z22.s, z9.b, z1.b[2]\n"
+ "udot z26.s, z9.b, z2.b[2]\n"
+ "udot z19.s, z10.b, z0.b[2]\n"
+ "udot z23.s, z10.b, z1.b[2]\n"
+ "udot z27.s, z10.b, z2.b[2]\n"
+ "ble 41f\n"
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "udot z16.s, z4.b, z0.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "udot z20.s, z4.b, z1.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "udot z24.s, z4.b, z2.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "udot z17.s, z5.b, z0.b[3]\n"
+ "udot z21.s, z5.b, z1.b[3]\n"
+ "udot z25.s, z5.b, z2.b[3]\n"
+ "udot z18.s, z6.b, z0.b[3]\n"
+ "udot z22.s, z6.b, z1.b[3]\n"
+ "udot z26.s, z6.b, z2.b[3]\n"
+ "udot z19.s, z7.b, z0.b[3]\n"
+ "udot z23.s, z7.b, z1.b[3]\n"
+ "udot z27.s, z7.b, z2.b[3]\n"
+ "41:" // Height 3: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 42f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z12.s, z1.b, z15.b\n"
+ "udot z13.s, z2.b, z15.b\n"
+ "42:" // Height 3: Multiply loop: unique 6: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x28, x28, #0x1\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x28, x19\n"
+ "bne 35b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "tbnz %x[flags], #31, 43f\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1rw { z3.s }, p2/Z, [x19]\n"
+ "neg z3.s, p2/M, z3.s\n"
+ "mov x20, #0x4\n"
+ "mov x19, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
+ "uaddv d11, p0, z11.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "uaddv d12, p0, z12.s\n"
+ "mov x19, #0x4\n"
+ "mov z11.s, z11.s[0]\n"
+ "whilelt p0.s, XZR, x19\n"
+ "mov z12.s, z12.s[0]\n"
+ "uaddv d13, p0, z13.s\n"
+ "mul z11.s, p2/M, z11.s, z3.s\n"
+ "mul z12.s, p2/M, z12.s, z3.s\n"
+ "mov z13.s, z13.s[0]\n"
+ "mul z13.s, p2/M, z13.s, z3.s\n"
+ "43:" // Height 3: skip row sum fixup
+ "add z16.s, z16.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x10]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z17.s, z17.s, z11.s\n"
+ "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z18.s, z18.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "add z19.s, z19.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "add z20.s, z20.s, z12.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add z21.s, z21.s, z12.s\n"
+ "add z22.s, z22.s, z12.s\n"
+ "add z23.s, z23.s, z12.s\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z25.s, z25.s, z13.s\n"
+ "add z26.s, z26.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
+ "add z16.s, z16.s, z0.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "add z20.s, z20.s, z0.s\n"
+ "add z21.s, z21.s, z1.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z23.s, z23.s, z3.s\n"
+ "add z24.s, z24.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z25.s, z25.s, z1.s\n"
+ "add z26.s, z26.s, z2.s\n"
+ "add z27.s, z27.s, z3.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
+ ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
+ ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
+ ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
+ ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
+ "tbz %x[flags], #5, 44f\n"
+ "and z4.d, z16.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z17.d, z0.d\n"
+ "and z6.d, z18.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z19.d, z0.d\n"
+ "and z8.d, z20.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "and z9.d, z21.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "and z10.d, z22.d, z0.d\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "and z4.d, z23.d, z0.d\n"
+ "asr z9.s, z9.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z24.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "sqadd z20.s, z20.s, z8.s\n"
+ "sqadd z21.s, z21.s, z9.s\n"
+ "sqadd z22.s, z22.s, z10.s\n"
+ "sqadd z23.s, z23.s, z4.s\n"
+ "and z6.d, z25.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z5.s\n"
+ "and z7.d, z26.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "and z8.d, z27.d, z0.d\n"
+ "sqadd z25.s, z25.s, z6.s\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "sqadd z26.s, z26.s, z7.s\n"
+ "sqadd z27.s, z27.s, z8.s\n"
+ "44:" // Height 3: no shift correction
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x9]\n"
+ "add z21.s, z21.s, z4.s\n"
+ "addvl x9, x9, #1\n"
+ ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "add z25.s, z25.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "smin z24.s, p2/M, z24.s, z6.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "smax z24.s, p2/M, z24.s, z5.s\n"
+ "smin z25.s, p2/M, z25.s, z6.s\n"
+ ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
+ "uzp1 z21.h, z22.h, z23.h\n"
+ ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
+ "uzp1 z20.b, z20.b, z21.b\n"
+ "st1b { z20.b }, p1, [x25]\n"
+ "add z26.s, z26.s, z4.s\n"
+ "addvl x25, x25, #1\n"
+ "add z27.s, z27.s, z4.s\n"
+ "smax z25.s, p2/M, z25.s, z5.s\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "smax z26.s, p2/M, z26.s, z5.s\n"
+ "smax z27.s, p2/M, z27.s, z5.s\n"
+ "uzp1 z25.h, z26.h, z27.h\n"
+ "uzp1 z24.b, z24.b, z25.b\n"
+ "st1b { z24.b }, p1, [x23]\n"
+ "addvl x23, x23, #1\n"
+ "45:" // Height 3: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x12, x12, x19\n"
+ "bgt 33b\n"
+ "b 62f\n"
+ "46:" // Height 4
+ "mov z11.s, #0x0\n"
+ "ldr x12, [%x[args_ptr], %[offsetof_N]]\n"
+ "mov x10, %x[col_bias]\n"
+ "mov z12.s, #0x0\n"
+ "ldr x11, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "bic %x[flags], %x[flags], #0x80000000\n"
+ "mov z13.s, #0x0\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.b, #0x1\n"
+ "tbz %x[flags], #2, 47f\n"
+ "ldr x9, [%x[output_ptr], #0x0]\n"
+ "ldr x25, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19\n"
+ "ldr x23, [%x[output_ptr], #0x10]\n"
+ "ldr x21, [%x[output_ptr], #0x18]\n"
+ "add x25, x25, x19\n"
+ "add %x[output_ptr], %x[output_ptr], #0x20\n"
+ "add x23, x23, x19\n"
+ "add x21, x21, x19\n"
+ "b 48f\n"
+ "47:" // Height 4: setup direct output
+ "mov x9, %x[output_ptr]\n"
+ "add x25, x9, x19\n"
+ "add x23, x25, x19\n"
+ "add x21, x23, x19\n"
+ "add %x[output_ptr], x21, x19\n"
+ "48:" // Height 4: Column loop
+ "mov z16.s, #0x0\n"
+ "mov x19, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "whilelt p1.b, x19, x12\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "49:" // Height 4: setup done
+ "mov x28, #0x0\n"
+ "50:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w27, [x20, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 51f\n"
+ "ldr x20, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x26, [x20, #0x0]\n"
+ "ldr x24, [x20, #0x8]\n"
+ "ldr x22, [x20, #0x10]\n"
+ "ldr x20, [x20, #0x18]\n"
+ "cbnz x28, 52f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "add x20, x20, x19\n"
+ "b 52f\n"
+ "51:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "add x20, x22, x19\n"
+ "52:" // Height 4: input setup done
+ "cmp x27, #0x10\n"
+ "ble 55f\n"
+ "53:" // Height 4: Multiply loop: Main loop head
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "udot z16.s, z4.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "udot z17.s, z5.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "udot z20.s, z4.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "udot z24.s, z4.b, z2.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "udot z21.s, z5.b, z1.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "udot z25.s, z5.b, z2.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #4, MUL VL]\n"
+ "udot z28.s, z4.b, z3.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #5, MUL VL]\n"
+ "udot z29.s, z5.b, z3.b[0]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #6, MUL VL]\n"
+ "udot z18.s, z6.b, z0.b[0]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #7, MUL VL]\n"
+ "addvl x11, x11, #16\n"
+ "udot z22.s, z6.b, z1.b[0]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-8, MUL VL]\n"
+ "udot z26.s, z6.b, z2.b[0]\n"
+ "udot z30.s, z6.b, z3.b[0]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #-7, MUL VL]\n"
+ "udot z19.s, z7.b, z0.b[0]\n"
+ "udot z23.s, z7.b, z1.b[0]\n"
+ "udot z27.s, z7.b, z2.b[0]\n"
+ "udot z31.s, z7.b, z3.b[0]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #-6, MUL VL]\n"
+ "udot z16.s, z8.b, z0.b[1]\n"
+ "udot z20.s, z8.b, z1.b[1]\n"
+ "udot z24.s, z8.b, z2.b[1]\n"
+ "udot z28.s, z8.b, z3.b[1]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #-5, MUL VL]\n"
+ "udot z17.s, z9.b, z0.b[1]\n"
+ "udot z21.s, z9.b, z1.b[1]\n"
+ "udot z25.s, z9.b, z2.b[1]\n"
+ "udot z29.s, z9.b, z3.b[1]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #-4, MUL VL]\n"
+ "udot z18.s, z10.b, z0.b[1]\n"
+ "udot z22.s, z10.b, z1.b[1]\n"
+ "udot z26.s, z10.b, z2.b[1]\n"
+ "udot z30.s, z10.b, z3.b[1]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #-3, MUL VL]\n"
+ "udot z19.s, z4.b, z0.b[1]\n"
+ "udot z23.s, z4.b, z1.b[1]\n"
+ "udot z27.s, z4.b, z2.b[1]\n"
+ "udot z31.s, z4.b, z3.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #-2, MUL VL]\n"
+ "udot z16.s, z5.b, z0.b[2]\n"
+ "udot z20.s, z5.b, z1.b[2]\n"
+ "udot z24.s, z5.b, z2.b[2]\n"
+ "udot z28.s, z5.b, z3.b[2]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #-1, MUL VL]\n"
+ "udot z17.s, z6.b, z0.b[2]\n"
+ "udot z21.s, z6.b, z1.b[2]\n"
+ "udot z25.s, z6.b, z2.b[2]\n"
+ "udot z29.s, z6.b, z3.b[2]\n"
+ "udot z18.s, z7.b, z0.b[2]\n"
+ "udot z22.s, z7.b, z1.b[2]\n"
+ "udot z26.s, z7.b, z2.b[2]\n"
+ "udot z30.s, z7.b, z3.b[2]\n"
+ "udot z19.s, z8.b, z0.b[2]\n"
+ "udot z23.s, z8.b, z1.b[2]\n"
+ "udot z27.s, z8.b, z2.b[2]\n"
+ "udot z31.s, z8.b, z3.b[2]\n"
+ "udot z16.s, z9.b, z0.b[3]\n"
+ "udot z20.s, z9.b, z1.b[3]\n"
+ "udot z24.s, z9.b, z2.b[3]\n"
+ "udot z28.s, z9.b, z3.b[3]\n"
+ "udot z17.s, z10.b, z0.b[3]\n"
+ "udot z21.s, z10.b, z1.b[3]\n"
+ "udot z25.s, z10.b, z2.b[3]\n"
+ "udot z29.s, z10.b, z3.b[3]\n"
+ "udot z18.s, z4.b, z0.b[3]\n"
+ "udot z22.s, z4.b, z1.b[3]\n"
+ "udot z26.s, z4.b, z2.b[3]\n"
+ "udot z30.s, z4.b, z3.b[3]\n"
+ "udot z19.s, z5.b, z0.b[3]\n"
+ "udot z23.s, z5.b, z1.b[3]\n"
+ "udot z27.s, z5.b, z2.b[3]\n"
+ "udot z31.s, z5.b, z3.b[3]\n"
+ "tbnz %x[flags], #31, 54f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z12.s, z1.b, z15.b\n"
+ "udot z13.s, z2.b, z15.b\n"
+ "udot z14.s, z3.b, z15.b\n"
+ "54:" // Height 4: Multiply loop: unique 7: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "sub x27, x27, #0x10\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "cmp x27, #0x10\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "bgt 53b\n"
+ "55:" // Height 4: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p2/Z, [x11]\n"
+ "whilelt p0.b, XZR, x27\n"
+ "ld1b { z7.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x26]\n"
+ "udot z16.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "udot z17.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "udot z20.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "udot z24.s, z6.b, z2.b[0]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "add x20, x20, #0x10\n"
+ "udot z21.s, z7.b, z1.b[0]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "udot z28.s, z6.b, z3.b[0]\n"
+ "udot z25.s, z7.b, z2.b[0]\n"
+ "udot z29.s, z7.b, z3.b[0]\n"
+ "udot z18.s, z8.b, z0.b[0]\n"
+ "udot z22.s, z8.b, z1.b[0]\n"
+ "udot z26.s, z8.b, z2.b[0]\n"
+ "udot z30.s, z8.b, z3.b[0]\n"
+ "udot z19.s, z9.b, z0.b[0]\n"
+ "udot z23.s, z9.b, z1.b[0]\n"
+ "udot z27.s, z9.b, z2.b[0]\n"
+ "udot z31.s, z9.b, z3.b[0]\n"
+ "ble 56f\n"
+ "ld1b { z10.b }, p2/Z, [x11]\n"
+ "udot z16.s, z10.b, z0.b[1]\n"
+ "ld1b { z4.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z20.s, z10.b, z1.b[1]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "udot z24.s, z10.b, z2.b[1]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "udot z28.s, z10.b, z3.b[1]\n"
+ "udot z17.s, z4.b, z0.b[1]\n"
+ "udot z21.s, z4.b, z1.b[1]\n"
+ "udot z25.s, z4.b, z2.b[1]\n"
+ "udot z29.s, z4.b, z3.b[1]\n"
+ "udot z18.s, z5.b, z0.b[1]\n"
+ "udot z22.s, z5.b, z1.b[1]\n"
+ "udot z26.s, z5.b, z2.b[1]\n"
+ "udot z30.s, z5.b, z3.b[1]\n"
+ "udot z19.s, z6.b, z0.b[1]\n"
+ "udot z23.s, z6.b, z1.b[1]\n"
+ "udot z27.s, z6.b, z2.b[1]\n"
+ "udot z31.s, z6.b, z3.b[1]\n"
+ "ble 56f\n"
+ "ld1b { z7.b }, p2/Z, [x11]\n"
+ "udot z16.s, z7.b, z0.b[2]\n"
+ "ld1b { z8.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "subs x27, x27, #0x4\n"
+ "udot z20.s, z7.b, z1.b[2]\n"
+ "ld1b { z9.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "udot z24.s, z7.b, z2.b[2]\n"
+ "ld1b { z10.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "udot z28.s, z7.b, z3.b[2]\n"
+ "udot z17.s, z8.b, z0.b[2]\n"
+ "udot z21.s, z8.b, z1.b[2]\n"
+ "udot z25.s, z8.b, z2.b[2]\n"
+ "udot z29.s, z8.b, z3.b[2]\n"
+ "udot z18.s, z9.b, z0.b[2]\n"
+ "udot z22.s, z9.b, z1.b[2]\n"
+ "udot z26.s, z9.b, z2.b[2]\n"
+ "udot z30.s, z9.b, z3.b[2]\n"
+ "udot z19.s, z10.b, z0.b[2]\n"
+ "udot z23.s, z10.b, z1.b[2]\n"
+ "udot z27.s, z10.b, z2.b[2]\n"
+ "udot z31.s, z10.b, z3.b[2]\n"
+ "ble 56f\n"
+ "ld1b { z4.b }, p2/Z, [x11]\n"
+ "udot z16.s, z4.b, z0.b[3]\n"
+ "ld1b { z5.b }, p2/Z, [x11, #1, MUL VL]\n"
+ "udot z20.s, z4.b, z1.b[3]\n"
+ "ld1b { z6.b }, p2/Z, [x11, #2, MUL VL]\n"
+ "udot z24.s, z4.b, z2.b[3]\n"
+ "ld1b { z7.b }, p2/Z, [x11, #3, MUL VL]\n"
+ "addvl x11, x11, #4\n"
+ "udot z28.s, z4.b, z3.b[3]\n"
+ "udot z17.s, z5.b, z0.b[3]\n"
+ "udot z21.s, z5.b, z1.b[3]\n"
+ "udot z25.s, z5.b, z2.b[3]\n"
+ "udot z29.s, z5.b, z3.b[3]\n"
+ "udot z18.s, z6.b, z0.b[3]\n"
+ "udot z22.s, z6.b, z1.b[3]\n"
+ "udot z26.s, z6.b, z2.b[3]\n"
+ "udot z30.s, z6.b, z3.b[3]\n"
+ "udot z19.s, z7.b, z0.b[3]\n"
+ "udot z23.s, z7.b, z1.b[3]\n"
+ "udot z27.s, z7.b, z2.b[3]\n"
+ "udot z31.s, z7.b, z3.b[3]\n"
+ "56:" // Height 4: Multiply loop: multiply skip
+ "tbnz %x[flags], #31, 57f\n"
+ "udot z11.s, z0.b, z15.b\n"
+ "udot z12.s, z1.b, z15.b\n"
+ "udot z13.s, z2.b, z15.b\n"
+ "udot z14.s, z3.b, z15.b\n"
+ "57:" // Height 4: Multiply loop: unique 8: skip row sum
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "add x28, x28, #0x1\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x28, x19\n"
+ "bne 50b\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "tbnz %x[flags], #31, 58f\n"
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "neg z4.s, p2/M, z4.s\n"
+ "mov x20, #0x4\n"
+ "mov x19, #0x4\n"
+ "whilelt p0.s, XZR, x20\n"
+ "uaddv d11, p0, z11.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "uaddv d12, p0, z12.s\n"
+ "mov x19, #0x4\n"
+ "mov z11.s, z11.s[0]\n"
+ "whilelt p0.s, XZR, x19\n"
+ "mov x19, #0x4\n"
+ "mov z12.s, z12.s[0]\n"
+ "uaddv d13, p0, z13.s\n"
+ "whilelt p0.s, XZR, x19\n"
+ "mul z11.s, p2/M, z11.s, z4.s\n"
+ "uaddv d14, p0, z14.s\n"
+ "mul z12.s, p2/M, z12.s, z4.s\n"
+ "mov z13.s, z13.s[0]\n"
+ "mul z13.s, p2/M, z13.s, z4.s\n"
+ "mov z14.s, z14.s[0]\n"
+ "mul z14.s, p2/M, z14.s, z4.s\n"
+ "58:" // Height 4: skip row sum fixup
+ "add z16.s, z16.s, z11.s\n"
+ "ld1w { z0.s }, p2/Z, [x10]\n"
+ "orr %x[flags], %x[flags], #0x80000000\n"
+ "add z17.s, z17.s, z11.s\n"
+ "ld1w { z1.s }, p2/Z, [x10, #1, MUL VL]\n"
+ "add x20, %x[qp], %[per_layer_right_shift]\n"
+ "add z18.s, z18.s, z11.s\n"
+ "ld1w { z2.s }, p2/Z, [x10, #2, MUL VL]\n"
+ "add x19, %x[qp], %[per_layer_mul]\n"
+ "add z19.s, z19.s, z11.s\n"
+ "ld1w { z3.s }, p2/Z, [x10, #3, MUL VL]\n"
+ "addvl x10, x10, #4\n"
+ "add z20.s, z20.s, z12.s\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ "add z21.s, z21.s, z12.s\n"
+ "add z22.s, z22.s, z12.s\n"
+ "add z23.s, z23.s, z12.s\n"
+ "add z24.s, z24.s, z13.s\n"
+ "add z25.s, z25.s, z13.s\n"
+ "add z26.s, z26.s, z13.s\n"
+ "add z27.s, z27.s, z13.s\n"
+ "add z28.s, z28.s, z14.s\n"
+ "add z29.s, z29.s, z14.s\n"
+ "add z30.s, z30.s, z14.s\n"
+ "add z31.s, z31.s, z14.s\n"
+ "add z16.s, z16.s, z0.s\n"
+ "add z17.s, z17.s, z1.s\n"
+ "add z18.s, z18.s, z2.s\n"
+ "add z19.s, z19.s, z3.s\n"
+ "add z20.s, z20.s, z0.s\n"
+ "add z21.s, z21.s, z1.s\n"
+ "add z22.s, z22.s, z2.s\n"
+ "add z23.s, z23.s, z3.s\n"
+ "add z24.s, z24.s, z0.s\n"
+ "add z25.s, z25.s, z1.s\n"
+ "add z26.s, z26.s, z2.s\n"
+ "add z27.s, z27.s, z3.s\n"
+ "add z28.s, z28.s, z0.s\n"
+ "ld1rw { z0.s }, p2/Z, [x20]\n"
+ "add z29.s, z29.s, z1.s\n"
+ "add z30.s, z30.s, z2.s\n"
+ "add z31.s, z31.s, z3.s\n"
+ ".inst 0x04a47610 // sqrdmulh z16.s, z16.s, z4.s\n"
+ ".inst 0x04a47631 // sqrdmulh z17.s, z17.s, z4.s\n"
+ ".inst 0x04a47652 // sqrdmulh z18.s, z18.s, z4.s\n"
+ ".inst 0x04a47673 // sqrdmulh z19.s, z19.s, z4.s\n"
+ ".inst 0x04a47694 // sqrdmulh z20.s, z20.s, z4.s\n"
+ ".inst 0x04a476b5 // sqrdmulh z21.s, z21.s, z4.s\n"
+ ".inst 0x04a476d6 // sqrdmulh z22.s, z22.s, z4.s\n"
+ ".inst 0x04a476f7 // sqrdmulh z23.s, z23.s, z4.s\n"
+ ".inst 0x04a47718 // sqrdmulh z24.s, z24.s, z4.s\n"
+ ".inst 0x04a47739 // sqrdmulh z25.s, z25.s, z4.s\n"
+ ".inst 0x04a4775a // sqrdmulh z26.s, z26.s, z4.s\n"
+ ".inst 0x04a4777b // sqrdmulh z27.s, z27.s, z4.s\n"
+ ".inst 0x04a4779c // sqrdmulh z28.s, z28.s, z4.s\n"
+ ".inst 0x04a477bd // sqrdmulh z29.s, z29.s, z4.s\n"
+ ".inst 0x04a477de // sqrdmulh z30.s, z30.s, z4.s\n"
+ ".inst 0x04a477ff // sqrdmulh z31.s, z31.s, z4.s\n"
+ "tbz %x[flags], #5, 59f\n"
+ "and z4.d, z16.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z17.d, z0.d\n"
+ "and z6.d, z18.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "and z7.d, z19.d, z0.d\n"
+ "and z8.d, z20.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "and z9.d, z21.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "sqadd z16.s, z16.s, z4.s\n"
+ "and z10.d, z22.d, z0.d\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "and z4.d, z23.d, z0.d\n"
+ "asr z9.s, z9.s, #0x1f\n"
+ "sqadd z17.s, z17.s, z5.s\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "sqadd z18.s, z18.s, z6.s\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "and z5.d, z24.d, z0.d\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z19.s, z19.s, z7.s\n"
+ "sqadd z20.s, z20.s, z8.s\n"
+ "sqadd z21.s, z21.s, z9.s\n"
+ "sqadd z22.s, z22.s, z10.s\n"
+ "sqadd z23.s, z23.s, z4.s\n"
+ "and z6.d, z25.d, z0.d\n"
+ "asr z6.s, z6.s, #0x1f\n"
+ "sqadd z24.s, z24.s, z5.s\n"
+ "and z7.d, z26.d, z0.d\n"
+ "asr z7.s, z7.s, #0x1f\n"
+ "and z8.d, z27.d, z0.d\n"
+ "and z9.d, z28.d, z0.d\n"
+ "asr z8.s, z8.s, #0x1f\n"
+ "sqadd z25.s, z25.s, z6.s\n"
+ "and z10.d, z29.d, z0.d\n"
+ "asr z9.s, z9.s, #0x1f\n"
+ "and z4.d, z30.d, z0.d\n"
+ "asr z10.s, z10.s, #0x1f\n"
+ "sqadd z26.s, z26.s, z7.s\n"
+ "and z5.d, z31.d, z0.d\n"
+ "asr z4.s, z4.s, #0x1f\n"
+ "sqadd z27.s, z27.s, z8.s\n"
+ "asr z5.s, z5.s, #0x1f\n"
+ "sqadd z28.s, z28.s, z9.s\n"
+ "sqadd z29.s, z29.s, z10.s\n"
+ "sqadd z30.s, z30.s, z4.s\n"
+ "sqadd z31.s, z31.s, z5.s\n"
+ "59:" // Height 4: no shift correction
+ ".inst 0x44828810 // srshl z16.s, p2/M, z16.s, z0.s\n"
+ "add x19, %x[qp], %[c_offset]\n"
+ "ld1rw { z4.s }, p2/Z, [x19]\n"
+ ".inst 0x44828811 // srshl z17.s, p2/M, z17.s, z0.s\n"
+ "add x19, %x[qp], %[minval]\n"
+ ".inst 0x44828812 // srshl z18.s, p2/M, z18.s, z0.s\n"
+ "ld1rw { z5.s }, p2/Z, [x19]\n"
+ "add x19, %x[qp], %[maxval]\n"
+ ".inst 0x44828813 // srshl z19.s, p2/M, z19.s, z0.s\n"
+ "ld1rw { z6.s }, p2/Z, [x19]\n"
+ ".inst 0x44828814 // srshl z20.s, p2/M, z20.s, z0.s\n"
+ "add z16.s, z16.s, z4.s\n"
+ "add z17.s, z17.s, z4.s\n"
+ "add z18.s, z18.s, z4.s\n"
+ "add z19.s, z19.s, z4.s\n"
+ "add z20.s, z20.s, z4.s\n"
+ "smin z16.s, p2/M, z16.s, z6.s\n"
+ "smin z17.s, p2/M, z17.s, z6.s\n"
+ "smin z18.s, p2/M, z18.s, z6.s\n"
+ "smin z19.s, p2/M, z19.s, z6.s\n"
+ "smax z16.s, p2/M, z16.s, z5.s\n"
+ "smax z17.s, p2/M, z17.s, z5.s\n"
+ "smax z18.s, p2/M, z18.s, z5.s\n"
+ "smax z19.s, p2/M, z19.s, z5.s\n"
+ "smin z20.s, p2/M, z20.s, z6.s\n"
+ "uzp1 z16.h, z16.h, z17.h\n"
+ ".inst 0x44828815 // srshl z21.s, p2/M, z21.s, z0.s\n"
+ "uzp1 z17.h, z18.h, z19.h\n"
+ "smax z20.s, p2/M, z20.s, z5.s\n"
+ "uzp1 z16.b, z16.b, z17.b\n"
+ "st1b { z16.b }, p1, [x9]\n"
+ "add z21.s, z21.s, z4.s\n"
+ "addvl x9, x9, #1\n"
+ ".inst 0x44828816 // srshl z22.s, p2/M, z22.s, z0.s\n"
+ ".inst 0x44828817 // srshl z23.s, p2/M, z23.s, z0.s\n"
+ ".inst 0x44828818 // srshl z24.s, p2/M, z24.s, z0.s\n"
+ "smin z21.s, p2/M, z21.s, z6.s\n"
+ ".inst 0x44828819 // srshl z25.s, p2/M, z25.s, z0.s\n"
+ "add z22.s, z22.s, z4.s\n"
+ "add z23.s, z23.s, z4.s\n"
+ "add z24.s, z24.s, z4.s\n"
+ "add z25.s, z25.s, z4.s\n"
+ "smax z21.s, p2/M, z21.s, z5.s\n"
+ "smin z22.s, p2/M, z22.s, z6.s\n"
+ "smin z23.s, p2/M, z23.s, z6.s\n"
+ "smin z24.s, p2/M, z24.s, z6.s\n"
+ "uzp1 z20.h, z20.h, z21.h\n"
+ "smax z22.s, p2/M, z22.s, z5.s\n"
+ "smax z23.s, p2/M, z23.s, z5.s\n"
+ "smax z24.s, p2/M, z24.s, z5.s\n"
+ "smin z25.s, p2/M, z25.s, z6.s\n"
+ ".inst 0x4482881a // srshl z26.s, p2/M, z26.s, z0.s\n"
+ "uzp1 z21.h, z22.h, z23.h\n"
+ ".inst 0x4482881b // srshl z27.s, p2/M, z27.s, z0.s\n"
+ "uzp1 z20.b, z20.b, z21.b\n"
+ "st1b { z20.b }, p1, [x25]\n"
+ "add z26.s, z26.s, z4.s\n"
+ "addvl x25, x25, #1\n"
+ "add z27.s, z27.s, z4.s\n"
+ "smax z25.s, p2/M, z25.s, z5.s\n"
+ ".inst 0x4482881c // srshl z28.s, p2/M, z28.s, z0.s\n"
+ "smin z26.s, p2/M, z26.s, z6.s\n"
+ "smin z27.s, p2/M, z27.s, z6.s\n"
+ "uzp1 z24.h, z24.h, z25.h\n"
+ "add z28.s, z28.s, z4.s\n"
+ "smax z26.s, p2/M, z26.s, z5.s\n"
+ "smax z27.s, p2/M, z27.s, z5.s\n"
+ "smin z28.s, p2/M, z28.s, z6.s\n"
+ ".inst 0x4482881d // srshl z29.s, p2/M, z29.s, z0.s\n"
+ ".inst 0x4482881e // srshl z30.s, p2/M, z30.s, z0.s\n"
+ "uzp1 z25.h, z26.h, z27.h\n"
+ "smax z28.s, p2/M, z28.s, z5.s\n"
+ "add z29.s, z29.s, z4.s\n"
+ "add z30.s, z30.s, z4.s\n"
+ "uzp1 z24.b, z24.b, z25.b\n"
+ "st1b { z24.b }, p1, [x23]\n"
+ "smin z29.s, p2/M, z29.s, z6.s\n"
+ "addvl x23, x23, #1\n"
+ "smin z30.s, p2/M, z30.s, z6.s\n"
+ ".inst 0x4482881f // srshl z31.s, p2/M, z31.s, z0.s\n"
+ "smax z29.s, p2/M, z29.s, z5.s\n"
+ "add z31.s, z31.s, z4.s\n"
+ "smax z30.s, p2/M, z30.s, z5.s\n"
+ "uzp1 z28.h, z28.h, z29.h\n"
+ "smin z31.s, p2/M, z31.s, z6.s\n"
+ "smax z31.s, p2/M, z31.s, z5.s\n"
+ "uzp1 z29.h, z30.h, z31.h\n"
+ "uzp1 z28.b, z28.b, z29.b\n"
+ "st1b { z28.b }, p1, [x21]\n"
+ "addvl x21, x21, #1\n"
+ "60:" // Height 4: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x12, x12, x19\n"
+ "bgt 48b\n"
+ "subs %x[M], %x[M], #0x4\n"
+ "beq 62f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 61f\n"
+ "add x20, x20, #0x4\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "61:" // Update direct input
+ "mov x19, #0x4\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "62:" // Exit
+
+ : [M] "+r" (M), [flags] "+r" (flags), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [c_offset] "I" (offsetof(Requantize32, c_offset)), [col_bias] "r" (col_bias), [maxval] "I" (offsetof(Requantize32, maxval)), [minval] "I" (offsetof(Requantize32, minval)), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [per_layer_mul] "I" (offsetof(Requantize32, per_layer_mul)), [per_layer_right_shift] "I" (offsetof(Requantize32, per_layer_right_shift)), [qp] "r" (qp)
+ : "cc", "memory", "p0", "p1", "p2", "x9", "x10", "x11", "x12", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp
deleted file mode 100644
index 565832e8de..0000000000
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_4VLx4/generic.cpp
+++ /dev/null
@@ -1,2137 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifdef __ARM_FEATURE_SVE
-
-#include <algorithm>
-
-#include "arm_gemm.hpp"
-#include <cstdint>
-#include "../../asmlib.hpp"
-#include "../../utils.hpp"
-
-namespace arm_gemm {
-
-void sve_hybrid_u8u32_dot_4VLx4(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation , bool accumulate) {
- const int K_stride = ((K + 3) / 4) * 4;
- const long loops_count = ((K + 16) / 32) - 1;
- K -= loops_count * 32;
- const long regs_count = (K / 16) - 1;
- K -= (regs_count + 1) * 16;
- const long leftovers = K;
- const long blocks_count = (K + 3) / 4;
-
- int rows_to_compute;
-
- for (int y=0; y<M; y+=rows_to_compute) {
- const uint8_t * const a_ptr0_base = A + (y * lda);
- const unsigned long ldab = lda * sizeof(uint8_t);
-
- uint32_t *c_ptr0 = C + (y * ldc);
-
- rows_to_compute = M-y;
- if (rows_to_compute > 4) {
- if (rows_to_compute % 4) {
- rows_to_compute = 4 - 1;
- } else {
- rows_to_compute = 4;
- }
- }
-
- for (int x0=0; x0<N; x0+=(4 * get_vector_length<uint32_t>())) {
- const long width = std::min((unsigned long)N-x0, (4 * get_vector_length<uint32_t>()));
- long loops = loops_count;
- long regs = regs_count;
- long temp = 0;
- long blocks = blocks_count;
- const uint8_t *a_ptr0 = a_ptr0_base;
- const uint8_t *b_ptr0 = B + (K_stride * x0);
- const unsigned long ldcb = ldc * sizeof(uint32_t);
-
- switch(rows_to_compute) {
- case 1:
- __asm __volatile (
- "whilelt p6.b, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.b\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z16.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z17.s, #0\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "mov z18.s, #0\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z19.s, #0\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "subs %[loops], %[loops], #0x1\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "udot z17.s, z13.b, z4.b[3]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "udot z17.s, z13.b, z4.b[3]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "b 5f\n"
- "4:\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "udot z17.s, z13.b, z4.b[3]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "5:\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "cc", "memory"
- );
- break;
- case 2:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "c_ptr1 .req X1\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "whilelt p6.b, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.b\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z16.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z17.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z18.s, #0\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "mov z19.s, #0\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z20.s, #0\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z21.s, #0\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z22.s, #0\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z23.s, #0\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "udot z23.s, z11.b, z1.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "udot z20.s, z12.b, z1.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "udot z21.s, z13.b, z1.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "udot z21.s, z9.b, z1.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z21.s, z13.b, z1.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
- "udot z23.s, z15.b, z1.b[3]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
- "udot z20.s, z8.b, z5.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
- "udot z21.s, z9.b, z5.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "udot z22.s, z10.b, z5.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "udot z23.s, z11.b, z5.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "udot z20.s, z12.b, z5.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
- "udot z21.s, z13.b, z5.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "udot z22.s, z14.b, z5.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "udot z23.s, z15.b, z5.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "udot z20.s, z8.b, z5.b[2]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
- "udot z21.s, z9.b, z5.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "udot z22.s, z10.b, z5.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "udot z23.s, z11.b, z5.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "udot z20.s, z12.b, z5.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "udot z17.s, z13.b, z4.b[3]\n"
- "udot z21.s, z13.b, z5.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "udot z22.s, z14.b, z5.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "udot z23.s, z15.b, z5.b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "udot z23.s, z11.b, z1.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "udot z20.s, z12.b, z1.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "udot z21.s, z13.b, z1.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "udot z21.s, z9.b, z1.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z21.s, z13.b, z1.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "udot z23.s, z15.b, z1.b[3]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "udot z20.s, z8.b, z5.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "udot z21.s, z9.b, z5.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "udot z22.s, z10.b, z5.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "udot z23.s, z11.b, z5.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "udot z20.s, z12.b, z5.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
- "udot z21.s, z13.b, z5.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "udot z22.s, z14.b, z5.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "udot z23.s, z15.b, z5.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "udot z20.s, z8.b, z5.b[2]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
- "udot z21.s, z9.b, z5.b[2]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "udot z22.s, z10.b, z5.b[2]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "udot z23.s, z11.b, z5.b[2]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "udot z20.s, z12.b, z5.b[3]\n"
- "udot z17.s, z13.b, z4.b[3]\n"
- "udot z21.s, z13.b, z5.b[3]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "udot z22.s, z14.b, z5.b[3]\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "udot z23.s, z15.b, z5.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[0]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "udot z23.s, z11.b, z1.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z20.s, z12.b, z1.b[1]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "udot z21.s, z13.b, z1.b[1]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "udot z21.s, z9.b, z1.b[2]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z21.s, z13.b, z1.b[3]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "udot z23.s, z15.b, z1.b[3]\n"
- "b 5f\n"
- "4:\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr1]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "udot z23.s, z11.b, z1.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "udot z20.s, z12.b, z1.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "udot z21.s, z13.b, z1.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "udot z21.s, z9.b, z1.b[2]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z21.s, z13.b, z1.b[3]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "udot z23.s, z15.b, z1.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z20.s, z8.b, z5.b[0]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
- "udot z21.s, z9.b, z5.b[0]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "udot z22.s, z10.b, z5.b[0]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "udot z23.s, z11.b, z5.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z20.s, z12.b, z5.b[1]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
- "udot z21.s, z13.b, z5.b[1]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "udot z22.s, z14.b, z5.b[1]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "udot z23.s, z15.b, z5.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "udot z20.s, z8.b, z5.b[2]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
- "udot z21.s, z9.b, z5.b[2]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "udot z22.s, z10.b, z5.b[2]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "udot z23.s, z11.b, z5.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "udot z20.s, z12.b, z5.b[3]\n"
- "udot z17.s, z13.b, z4.b[3]\n"
- "udot z21.s, z13.b, z5.b[3]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "udot z22.s, z14.b, z5.b[3]\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "udot z23.s, z15.b, z5.b[3]\n"
- "5:\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "st1w z20.s, p0, [c_ptr1]\n"
- "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
- "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
- "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq c_ptr1\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "cc", "memory"
- );
- break;
- case 3:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "c_ptr1 .req X2\n"
- "c_ptr2 .req X3\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "whilelt p6.b, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.b\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z16.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z17.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z18.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z19.s, #0\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "mov z20.s, #0\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z21.s, #0\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z22.s, #0\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z23.s, #0\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z24.s, #0\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z25.s, #0\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "mov z26.s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "mov z27.s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1w z24.s, p0/z, [c_ptr2]\n"
- "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
- "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "udot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "udot z26.s, z10.b, z2.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "udot z23.s, z11.b, z1.b[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "udot z27.s, z11.b, z2.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "udot z20.s, z12.b, z1.b[1]\n"
- "udot z24.s, z12.b, z2.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "udot z21.s, z13.b, z1.b[1]\n"
- "udot z25.s, z13.b, z2.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "udot z26.s, z14.b, z2.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
- "udot z27.s, z15.b, z2.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z24.s, z8.b, z2.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "udot z21.s, z9.b, z1.b[2]\n"
- "udot z25.s, z9.b, z2.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "udot z26.s, z10.b, z2.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
- "udot z27.s, z11.b, z2.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "udot z24.s, z12.b, z2.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z21.s, z13.b, z1.b[3]\n"
- "udot z25.s, z13.b, z2.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
- "udot z26.s, z14.b, z2.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
- "udot z23.s, z15.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
- "udot z27.s, z15.b, z2.b[3]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
- "udot z20.s, z8.b, z5.b[0]\n"
- "udot z24.s, z8.b, z6.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
- "udot z21.s, z9.b, z5.b[0]\n"
- "udot z25.s, z9.b, z6.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "udot z22.s, z10.b, z5.b[0]\n"
- "udot z26.s, z10.b, z6.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "udot z23.s, z11.b, z5.b[0]\n"
- "udot z27.s, z11.b, z6.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "udot z20.s, z12.b, z5.b[1]\n"
- "udot z24.s, z12.b, z6.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
- "udot z21.s, z13.b, z5.b[1]\n"
- "udot z25.s, z13.b, z6.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "udot z22.s, z14.b, z5.b[1]\n"
- "udot z26.s, z14.b, z6.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "udot z23.s, z15.b, z5.b[1]\n"
- "udot z27.s, z15.b, z6.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "udot z20.s, z8.b, z5.b[2]\n"
- "udot z24.s, z8.b, z6.b[2]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "udot z21.s, z9.b, z5.b[2]\n"
- "udot z25.s, z9.b, z6.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "udot z22.s, z10.b, z5.b[2]\n"
- "udot z26.s, z10.b, z6.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "udot z23.s, z11.b, z5.b[2]\n"
- "udot z27.s, z11.b, z6.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "udot z20.s, z12.b, z5.b[3]\n"
- "udot z24.s, z12.b, z6.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "udot z17.s, z13.b, z4.b[3]\n"
- "udot z21.s, z13.b, z5.b[3]\n"
- "udot z25.s, z13.b, z6.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "udot z22.s, z14.b, z5.b[3]\n"
- "udot z26.s, z14.b, z6.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "udot z23.s, z15.b, z5.b[3]\n"
- "udot z27.s, z15.b, z6.b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "udot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "udot z26.s, z10.b, z2.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "udot z23.s, z11.b, z1.b[0]\n"
- "udot z27.s, z11.b, z2.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "udot z20.s, z12.b, z1.b[1]\n"
- "udot z24.s, z12.b, z2.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "udot z21.s, z13.b, z1.b[1]\n"
- "udot z25.s, z13.b, z2.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "udot z26.s, z14.b, z2.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
- "udot z27.s, z15.b, z2.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z24.s, z8.b, z2.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "udot z21.s, z9.b, z1.b[2]\n"
- "udot z25.s, z9.b, z2.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "udot z26.s, z10.b, z2.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
- "udot z27.s, z11.b, z2.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "udot z24.s, z12.b, z2.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z21.s, z13.b, z1.b[3]\n"
- "udot z25.s, z13.b, z2.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
- "udot z26.s, z14.b, z2.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "udot z23.s, z15.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "udot z27.s, z15.b, z2.b[3]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "udot z20.s, z8.b, z5.b[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "udot z24.s, z8.b, z6.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "udot z21.s, z9.b, z5.b[0]\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- "udot z25.s, z9.b, z6.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "udot z22.s, z10.b, z5.b[0]\n"
- "udot z26.s, z10.b, z6.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "udot z23.s, z11.b, z5.b[0]\n"
- "udot z27.s, z11.b, z6.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "udot z20.s, z12.b, z5.b[1]\n"
- "udot z24.s, z12.b, z6.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
- "udot z21.s, z13.b, z5.b[1]\n"
- "udot z25.s, z13.b, z6.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "udot z22.s, z14.b, z5.b[1]\n"
- "udot z26.s, z14.b, z6.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "udot z23.s, z15.b, z5.b[1]\n"
- "udot z27.s, z15.b, z6.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "udot z20.s, z8.b, z5.b[2]\n"
- "udot z24.s, z8.b, z6.b[2]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
- "udot z21.s, z9.b, z5.b[2]\n"
- "udot z25.s, z9.b, z6.b[2]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "udot z22.s, z10.b, z5.b[2]\n"
- "udot z26.s, z10.b, z6.b[2]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "udot z23.s, z11.b, z5.b[2]\n"
- "udot z27.s, z11.b, z6.b[2]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "udot z20.s, z12.b, z5.b[3]\n"
- "udot z24.s, z12.b, z6.b[3]\n"
- "udot z17.s, z13.b, z4.b[3]\n"
- "udot z21.s, z13.b, z5.b[3]\n"
- "udot z25.s, z13.b, z6.b[3]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "udot z22.s, z14.b, z5.b[3]\n"
- "udot z26.s, z14.b, z6.b[3]\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "udot z23.s, z15.b, z5.b[3]\n"
- "udot z27.s, z15.b, z6.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[0]\n"
- "udot z24.s, z8.b, z2.b[0]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "udot z26.s, z10.b, z2.b[0]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "udot z23.s, z11.b, z1.b[0]\n"
- "udot z27.s, z11.b, z2.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z20.s, z12.b, z1.b[1]\n"
- "udot z24.s, z12.b, z2.b[1]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "udot z21.s, z13.b, z1.b[1]\n"
- "udot z25.s, z13.b, z2.b[1]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "udot z26.s, z14.b, z2.b[1]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
- "udot z27.s, z15.b, z2.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z24.s, z8.b, z2.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "udot z21.s, z9.b, z1.b[2]\n"
- "udot z25.s, z9.b, z2.b[2]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "udot z26.s, z10.b, z2.b[2]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
- "udot z27.s, z11.b, z2.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "udot z24.s, z12.b, z2.b[3]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z21.s, z13.b, z1.b[3]\n"
- "udot z25.s, z13.b, z2.b[3]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
- "udot z26.s, z14.b, z2.b[3]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "udot z23.s, z15.b, z1.b[3]\n"
- "udot z27.s, z15.b, z2.b[3]\n"
- "b 5f\n"
- "4:\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
- "udot z24.s, z8.b, z2.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr1]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr2]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "udot z26.s, z10.b, z2.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- "udot z23.s, z11.b, z1.b[0]\n"
- "udot z27.s, z11.b, z2.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "udot z20.s, z12.b, z1.b[1]\n"
- "udot z24.s, z12.b, z2.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "udot z21.s, z13.b, z1.b[1]\n"
- "udot z25.s, z13.b, z2.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "udot z26.s, z14.b, z2.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
- "udot z27.s, z15.b, z2.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z24.s, z8.b, z2.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "udot z21.s, z9.b, z1.b[2]\n"
- "udot z25.s, z9.b, z2.b[2]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "udot z26.s, z10.b, z2.b[2]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
- "udot z27.s, z11.b, z2.b[2]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "udot z24.s, z12.b, z2.b[3]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z21.s, z13.b, z1.b[3]\n"
- "udot z25.s, z13.b, z2.b[3]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
- "udot z26.s, z14.b, z2.b[3]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "udot z23.s, z15.b, z1.b[3]\n"
- "udot z27.s, z15.b, z2.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z20.s, z8.b, z5.b[0]\n"
- "udot z24.s, z8.b, z6.b[0]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
- "udot z21.s, z9.b, z5.b[0]\n"
- "udot z25.s, z9.b, z6.b[0]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "udot z22.s, z10.b, z5.b[0]\n"
- "udot z26.s, z10.b, z6.b[0]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "udot z23.s, z11.b, z5.b[0]\n"
- "udot z27.s, z11.b, z6.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z20.s, z12.b, z5.b[1]\n"
- "udot z24.s, z12.b, z6.b[1]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
- "udot z21.s, z13.b, z5.b[1]\n"
- "udot z25.s, z13.b, z6.b[1]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "udot z22.s, z14.b, z5.b[1]\n"
- "udot z26.s, z14.b, z6.b[1]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "udot z23.s, z15.b, z5.b[1]\n"
- "udot z27.s, z15.b, z6.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "udot z20.s, z8.b, z5.b[2]\n"
- "udot z24.s, z8.b, z6.b[2]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
- "udot z21.s, z9.b, z5.b[2]\n"
- "udot z25.s, z9.b, z6.b[2]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "udot z22.s, z10.b, z5.b[2]\n"
- "udot z26.s, z10.b, z6.b[2]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "udot z23.s, z11.b, z5.b[2]\n"
- "udot z27.s, z11.b, z6.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "udot z20.s, z12.b, z5.b[3]\n"
- "udot z24.s, z12.b, z6.b[3]\n"
- "udot z17.s, z13.b, z4.b[3]\n"
- "udot z21.s, z13.b, z5.b[3]\n"
- "udot z25.s, z13.b, z6.b[3]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "udot z22.s, z14.b, z5.b[3]\n"
- "udot z26.s, z14.b, z6.b[3]\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "udot z23.s, z15.b, z5.b[3]\n"
- "udot z27.s, z15.b, z6.b[3]\n"
- "5:\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "st1w z20.s, p0, [c_ptr1]\n"
- "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
- "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
- "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
- "st1w z24.s, p0, [c_ptr2]\n"
- "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
- "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
- "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "cc", "memory"
- );
- break;
- default:
- case 4:
- __asm __volatile (
- "a_ptr1 .req X0\n"
- "a_ptr2 .req X1\n"
- "a_ptr3 .req X2\n"
- "c_ptr1 .req X3\n"
- "c_ptr2 .req X4\n"
- "c_ptr3 .req X5\n"
- "add a_ptr1, %[a_ptr0], %[lda]\n"
- "add c_ptr1, %[c_ptr0], %[ldc]\n"
- "add a_ptr2, a_ptr1, %[lda]\n"
- "add c_ptr2, c_ptr1, %[ldc]\n"
- "add a_ptr3, a_ptr2, %[lda]\n"
- "add c_ptr3, c_ptr2, %[ldc]\n"
- "whilelt p6.b, %[temp], %[leftovers]\n"
- "whilelt p0.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "ptrue p7.b\n"
- "whilelt p1.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p2.s, %[temp], %[width]\n"
- "incw %[temp], all, mul #1\n"
- "whilelt p3.s, %[temp], %[width]\n"
- "cbnz %[accumulate], 1f\n"
- "mov z16.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "mov z17.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "mov z18.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "mov z19.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "mov z20.s, #0\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "mov z21.s, #0\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z22.s, #0\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z23.s, #0\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z24.s, #0\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z25.s, #0\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "mov z26.s, #0\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "mov z27.s, #0\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "mov z28.s, #0\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "mov z29.s, #0\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "mov z30.s, #0\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "mov z31.s, #0\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "b 3f\n"
- "1:\n"
- "ld1w z16.s, p0/z, [%[c_ptr0]]\n"
- "ld1w z17.s, p1/z, [%[c_ptr0], #1, MUL VL]\n"
- "ld1w z18.s, p2/z, [%[c_ptr0], #2, MUL VL]\n"
- "ld1w z19.s, p3/z, [%[c_ptr0], #3, MUL VL]\n"
- "ld1w z20.s, p0/z, [c_ptr1]\n"
- "ld1w z21.s, p1/z, [c_ptr1, #1, MUL VL]\n"
- "ld1w z22.s, p2/z, [c_ptr1, #2, MUL VL]\n"
- "ld1w z23.s, p3/z, [c_ptr1, #3, MUL VL]\n"
- "ld1w z24.s, p0/z, [c_ptr2]\n"
- "ld1w z25.s, p1/z, [c_ptr2, #1, MUL VL]\n"
- "ld1w z26.s, p2/z, [c_ptr2, #2, MUL VL]\n"
- "ld1w z27.s, p3/z, [c_ptr2, #3, MUL VL]\n"
- "ld1w z28.s, p0/z, [c_ptr3]\n"
- "ld1w z29.s, p1/z, [c_ptr3, #1, MUL VL]\n"
- "ld1w z30.s, p2/z, [c_ptr3, #2, MUL VL]\n"
- "ld1w z31.s, p3/z, [c_ptr3, #3, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "add %[a_ptr0], %[a_ptr0], #0x10\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "add a_ptr1, a_ptr1, #0x10\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "add a_ptr2, a_ptr2, #0x10\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "add a_ptr3, a_ptr3, #0x10\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "cbz %[loops], 2f\n"
- "3:\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "udot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "udot z28.s, z8.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr3]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "subs %[loops], %[loops], #0x1\n"
- "udot z29.s, z9.b, z3.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "add %[a_ptr0], %[a_ptr0], #0x20\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "add a_ptr1, a_ptr1, #0x20\n"
- "udot z26.s, z10.b, z2.b[0]\n"
- "add a_ptr2, a_ptr2, #0x20\n"
- "udot z30.s, z10.b, z3.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "add a_ptr3, a_ptr3, #0x20\n"
- "udot z23.s, z11.b, z1.b[0]\n"
- "udot z27.s, z11.b, z2.b[0]\n"
- "udot z31.s, z11.b, z3.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "udot z20.s, z12.b, z1.b[1]\n"
- "udot z24.s, z12.b, z2.b[1]\n"
- "udot z28.s, z12.b, z3.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "udot z21.s, z13.b, z1.b[1]\n"
- "udot z25.s, z13.b, z2.b[1]\n"
- "udot z29.s, z13.b, z3.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "udot z26.s, z14.b, z2.b[1]\n"
- "udot z30.s, z14.b, z3.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
- "udot z27.s, z15.b, z2.b[1]\n"
- "udot z31.s, z15.b, z3.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z24.s, z8.b, z2.b[2]\n"
- "udot z28.s, z8.b, z3.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "udot z21.s, z9.b, z1.b[2]\n"
- "udot z25.s, z9.b, z2.b[2]\n"
- "udot z29.s, z9.b, z3.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "udot z26.s, z10.b, z2.b[2]\n"
- "udot z30.s, z10.b, z3.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
- "udot z27.s, z11.b, z2.b[2]\n"
- "udot z31.s, z11.b, z3.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "udot z24.s, z12.b, z2.b[3]\n"
- "udot z28.s, z12.b, z3.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z21.s, z13.b, z1.b[3]\n"
- "udot z25.s, z13.b, z2.b[3]\n"
- "udot z29.s, z13.b, z3.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
- "udot z26.s, z14.b, z2.b[3]\n"
- "udot z30.s, z14.b, z3.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0], #-0x10]\n"
- "udot z23.s, z15.b, z1.b[3]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1, #-0x10]\n"
- "udot z27.s, z15.b, z2.b[3]\n"
- "ld1rqb z2.b, p7/z, [a_ptr2, #-0x10]\n"
- "udot z31.s, z15.b, z3.b[3]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3, #-0x10]\n"
- "udot z20.s, z8.b, z5.b[0]\n"
- "udot z24.s, z8.b, z6.b[0]\n"
- "udot z28.s, z8.b, z7.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
- "udot z21.s, z9.b, z5.b[0]\n"
- "udot z25.s, z9.b, z6.b[0]\n"
- "udot z29.s, z9.b, z7.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "udot z22.s, z10.b, z5.b[0]\n"
- "udot z26.s, z10.b, z6.b[0]\n"
- "udot z30.s, z10.b, z7.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "udot z23.s, z11.b, z5.b[0]\n"
- "udot z27.s, z11.b, z6.b[0]\n"
- "udot z31.s, z11.b, z7.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "udot z20.s, z12.b, z5.b[1]\n"
- "udot z24.s, z12.b, z6.b[1]\n"
- "udot z28.s, z12.b, z7.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
- "udot z21.s, z13.b, z5.b[1]\n"
- "udot z25.s, z13.b, z6.b[1]\n"
- "udot z29.s, z13.b, z7.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "udot z22.s, z14.b, z5.b[1]\n"
- "udot z26.s, z14.b, z6.b[1]\n"
- "udot z30.s, z14.b, z7.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "udot z23.s, z15.b, z5.b[1]\n"
- "udot z27.s, z15.b, z6.b[1]\n"
- "udot z31.s, z15.b, z7.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "udot z20.s, z8.b, z5.b[2]\n"
- "udot z24.s, z8.b, z6.b[2]\n"
- "udot z28.s, z8.b, z7.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
- "udot z21.s, z9.b, z5.b[2]\n"
- "udot z25.s, z9.b, z6.b[2]\n"
- "udot z29.s, z9.b, z7.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "udot z22.s, z10.b, z5.b[2]\n"
- "udot z26.s, z10.b, z6.b[2]\n"
- "udot z30.s, z10.b, z7.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "udot z23.s, z11.b, z5.b[2]\n"
- "udot z27.s, z11.b, z6.b[2]\n"
- "udot z31.s, z11.b, z7.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "udot z20.s, z12.b, z5.b[3]\n"
- "udot z24.s, z12.b, z6.b[3]\n"
- "udot z28.s, z12.b, z7.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "udot z17.s, z13.b, z4.b[3]\n"
- "udot z21.s, z13.b, z5.b[3]\n"
- "udot z25.s, z13.b, z6.b[3]\n"
- "udot z29.s, z13.b, z7.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "udot z22.s, z14.b, z5.b[3]\n"
- "udot z26.s, z14.b, z6.b[3]\n"
- "udot z30.s, z14.b, z7.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "udot z23.s, z15.b, z5.b[3]\n"
- "udot z27.s, z15.b, z6.b[3]\n"
- "udot z31.s, z15.b, z7.b[3]\n"
- "b.ne 3b\n"
- "2:\n"
- "cbz %[regs], 4f\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p7/z, [%[a_ptr0]]\n"
- "udot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr1]\n"
- "udot z28.s, z8.b, z3.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr2]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr3]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "udot z29.s, z9.b, z3.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "udot z26.s, z10.b, z2.b[0]\n"
- "udot z30.s, z10.b, z3.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "udot z23.s, z11.b, z1.b[0]\n"
- "udot z27.s, z11.b, z2.b[0]\n"
- "udot z31.s, z11.b, z3.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "udot z20.s, z12.b, z1.b[1]\n"
- "udot z24.s, z12.b, z2.b[1]\n"
- "udot z28.s, z12.b, z3.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "udot z21.s, z13.b, z1.b[1]\n"
- "udot z25.s, z13.b, z2.b[1]\n"
- "udot z29.s, z13.b, z3.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "udot z26.s, z14.b, z2.b[1]\n"
- "udot z30.s, z14.b, z3.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
- "udot z27.s, z15.b, z2.b[1]\n"
- "udot z31.s, z15.b, z3.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z24.s, z8.b, z2.b[2]\n"
- "udot z28.s, z8.b, z3.b[2]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "udot z21.s, z9.b, z1.b[2]\n"
- "udot z25.s, z9.b, z2.b[2]\n"
- "udot z29.s, z9.b, z3.b[2]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "udot z26.s, z10.b, z2.b[2]\n"
- "udot z30.s, z10.b, z3.b[2]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
- "udot z27.s, z11.b, z2.b[2]\n"
- "udot z31.s, z11.b, z3.b[2]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "udot z24.s, z12.b, z2.b[3]\n"
- "udot z28.s, z12.b, z3.b[3]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z21.s, z13.b, z1.b[3]\n"
- "udot z25.s, z13.b, z2.b[3]\n"
- "udot z29.s, z13.b, z3.b[3]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
- "udot z26.s, z14.b, z2.b[3]\n"
- "udot z30.s, z14.b, z3.b[3]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
- "udot z23.s, z15.b, z1.b[3]\n"
- "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
- "udot z27.s, z15.b, z2.b[3]\n"
- "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
- "udot z31.s, z15.b, z3.b[3]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
- "udot z20.s, z8.b, z5.b[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #2\n"
- "udot z24.s, z8.b, z6.b[0]\n"
- "addvl a_ptr1, a_ptr1, #2\n"
- "udot z28.s, z8.b, z7.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
- "addvl a_ptr2, a_ptr2, #2\n"
- "udot z21.s, z9.b, z5.b[0]\n"
- "addvl a_ptr3, a_ptr3, #2\n"
- "udot z25.s, z9.b, z6.b[0]\n"
- "udot z29.s, z9.b, z7.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "udot z22.s, z10.b, z5.b[0]\n"
- "udot z26.s, z10.b, z6.b[0]\n"
- "udot z30.s, z10.b, z7.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "udot z23.s, z11.b, z5.b[0]\n"
- "udot z27.s, z11.b, z6.b[0]\n"
- "udot z31.s, z11.b, z7.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "udot z20.s, z12.b, z5.b[1]\n"
- "udot z24.s, z12.b, z6.b[1]\n"
- "udot z28.s, z12.b, z7.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
- "udot z21.s, z13.b, z5.b[1]\n"
- "udot z25.s, z13.b, z6.b[1]\n"
- "udot z29.s, z13.b, z7.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "udot z22.s, z14.b, z5.b[1]\n"
- "udot z26.s, z14.b, z6.b[1]\n"
- "udot z30.s, z14.b, z7.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "udot z23.s, z15.b, z5.b[1]\n"
- "udot z27.s, z15.b, z6.b[1]\n"
- "udot z31.s, z15.b, z7.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "udot z20.s, z8.b, z5.b[2]\n"
- "udot z24.s, z8.b, z6.b[2]\n"
- "udot z28.s, z8.b, z7.b[2]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
- "udot z21.s, z9.b, z5.b[2]\n"
- "udot z25.s, z9.b, z6.b[2]\n"
- "udot z29.s, z9.b, z7.b[2]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "udot z22.s, z10.b, z5.b[2]\n"
- "udot z26.s, z10.b, z6.b[2]\n"
- "udot z30.s, z10.b, z7.b[2]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "udot z23.s, z11.b, z5.b[2]\n"
- "udot z27.s, z11.b, z6.b[2]\n"
- "udot z31.s, z11.b, z7.b[2]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "udot z20.s, z12.b, z5.b[3]\n"
- "udot z24.s, z12.b, z6.b[3]\n"
- "udot z28.s, z12.b, z7.b[3]\n"
- "udot z17.s, z13.b, z4.b[3]\n"
- "udot z21.s, z13.b, z5.b[3]\n"
- "udot z25.s, z13.b, z6.b[3]\n"
- "udot z29.s, z13.b, z7.b[3]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "udot z22.s, z14.b, z5.b[3]\n"
- "udot z26.s, z14.b, z6.b[3]\n"
- "udot z30.s, z14.b, z7.b[3]\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "udot z23.s, z15.b, z5.b[3]\n"
- "udot z27.s, z15.b, z6.b[3]\n"
- "udot z31.s, z15.b, z7.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[0]\n"
- "udot z24.s, z8.b, z2.b[0]\n"
- "udot z28.s, z8.b, z3.b[0]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "udot z29.s, z9.b, z3.b[0]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "udot z26.s, z10.b, z2.b[0]\n"
- "udot z30.s, z10.b, z3.b[0]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "udot z23.s, z11.b, z1.b[0]\n"
- "udot z27.s, z11.b, z2.b[0]\n"
- "udot z31.s, z11.b, z3.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z20.s, z12.b, z1.b[1]\n"
- "udot z24.s, z12.b, z2.b[1]\n"
- "udot z28.s, z12.b, z3.b[1]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "udot z21.s, z13.b, z1.b[1]\n"
- "udot z25.s, z13.b, z2.b[1]\n"
- "udot z29.s, z13.b, z3.b[1]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "udot z26.s, z14.b, z2.b[1]\n"
- "udot z30.s, z14.b, z3.b[1]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
- "udot z27.s, z15.b, z2.b[1]\n"
- "udot z31.s, z15.b, z3.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z24.s, z8.b, z2.b[2]\n"
- "udot z28.s, z8.b, z3.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "udot z21.s, z9.b, z1.b[2]\n"
- "udot z25.s, z9.b, z2.b[2]\n"
- "udot z29.s, z9.b, z3.b[2]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "udot z26.s, z10.b, z2.b[2]\n"
- "udot z30.s, z10.b, z3.b[2]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
- "udot z27.s, z11.b, z2.b[2]\n"
- "udot z31.s, z11.b, z3.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "udot z24.s, z12.b, z2.b[3]\n"
- "udot z28.s, z12.b, z3.b[3]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z21.s, z13.b, z1.b[3]\n"
- "udot z25.s, z13.b, z2.b[3]\n"
- "udot z29.s, z13.b, z3.b[3]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
- "udot z26.s, z14.b, z2.b[3]\n"
- "udot z30.s, z14.b, z3.b[3]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "udot z23.s, z15.b, z1.b[3]\n"
- "udot z27.s, z15.b, z2.b[3]\n"
- "udot z31.s, z15.b, z3.b[3]\n"
- "b 5f\n"
- "4:\n"
- "udot z16.s, z8.b, z0.b[0]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z20.s, z8.b, z1.b[0]\n"
- "ld1rqb z4.b, p6/z, [%[a_ptr0]]\n"
- "udot z24.s, z8.b, z2.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr1]\n"
- "udot z28.s, z8.b, z3.b[0]\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "udot z17.s, z9.b, z0.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr2]\n"
- "udot z21.s, z9.b, z1.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr3]\n"
- "udot z25.s, z9.b, z2.b[0]\n"
- "addvl %[a_ptr0], %[a_ptr0], #1\n"
- "udot z29.s, z9.b, z3.b[0]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "udot z18.s, z10.b, z0.b[0]\n"
- "addvl a_ptr1, a_ptr1, #1\n"
- "udot z22.s, z10.b, z1.b[0]\n"
- "addvl a_ptr2, a_ptr2, #1\n"
- "udot z26.s, z10.b, z2.b[0]\n"
- "addvl a_ptr3, a_ptr3, #1\n"
- "udot z30.s, z10.b, z3.b[0]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z19.s, z11.b, z0.b[0]\n"
- "udot z23.s, z11.b, z1.b[0]\n"
- "udot z27.s, z11.b, z2.b[0]\n"
- "udot z31.s, z11.b, z3.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z16.s, z12.b, z0.b[1]\n"
- "udot z20.s, z12.b, z1.b[1]\n"
- "udot z24.s, z12.b, z2.b[1]\n"
- "udot z28.s, z12.b, z3.b[1]\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "udot z17.s, z13.b, z0.b[1]\n"
- "udot z21.s, z13.b, z1.b[1]\n"
- "udot z25.s, z13.b, z2.b[1]\n"
- "udot z29.s, z13.b, z3.b[1]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "udot z18.s, z14.b, z0.b[1]\n"
- "udot z22.s, z14.b, z1.b[1]\n"
- "udot z26.s, z14.b, z2.b[1]\n"
- "udot z30.s, z14.b, z3.b[1]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z19.s, z15.b, z0.b[1]\n"
- "udot z23.s, z15.b, z1.b[1]\n"
- "udot z27.s, z15.b, z2.b[1]\n"
- "udot z31.s, z15.b, z3.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z16.s, z8.b, z0.b[2]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "udot z20.s, z8.b, z1.b[2]\n"
- "udot z24.s, z8.b, z2.b[2]\n"
- "udot z28.s, z8.b, z3.b[2]\n"
- "udot z17.s, z9.b, z0.b[2]\n"
- "udot z21.s, z9.b, z1.b[2]\n"
- "udot z25.s, z9.b, z2.b[2]\n"
- "udot z29.s, z9.b, z3.b[2]\n"
- "udot z18.s, z10.b, z0.b[2]\n"
- "udot z22.s, z10.b, z1.b[2]\n"
- "udot z26.s, z10.b, z2.b[2]\n"
- "udot z30.s, z10.b, z3.b[2]\n"
- "udot z19.s, z11.b, z0.b[2]\n"
- "udot z23.s, z11.b, z1.b[2]\n"
- "udot z27.s, z11.b, z2.b[2]\n"
- "udot z31.s, z11.b, z3.b[2]\n"
- "udot z16.s, z12.b, z0.b[3]\n"
- "udot z20.s, z12.b, z1.b[3]\n"
- "udot z24.s, z12.b, z2.b[3]\n"
- "udot z28.s, z12.b, z3.b[3]\n"
- "udot z17.s, z13.b, z0.b[3]\n"
- "udot z21.s, z13.b, z1.b[3]\n"
- "udot z25.s, z13.b, z2.b[3]\n"
- "udot z29.s, z13.b, z3.b[3]\n"
- "udot z18.s, z14.b, z0.b[3]\n"
- "udot z22.s, z14.b, z1.b[3]\n"
- "udot z26.s, z14.b, z2.b[3]\n"
- "udot z30.s, z14.b, z3.b[3]\n"
- "udot z19.s, z15.b, z0.b[3]\n"
- "udot z23.s, z15.b, z1.b[3]\n"
- "udot z27.s, z15.b, z2.b[3]\n"
- "udot z31.s, z15.b, z3.b[3]\n"
- "cbz %[blocks], 5f\n"
- "ld1b z8.b, p7/z, [%[b_ptr0]]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[0]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "udot z20.s, z8.b, z5.b[0]\n"
- "udot z24.s, z8.b, z6.b[0]\n"
- "udot z28.s, z8.b, z7.b[0]\n"
- "udot z17.s, z9.b, z4.b[0]\n"
- "udot z21.s, z9.b, z5.b[0]\n"
- "udot z25.s, z9.b, z6.b[0]\n"
- "udot z29.s, z9.b, z7.b[0]\n"
- "udot z18.s, z10.b, z4.b[0]\n"
- "udot z22.s, z10.b, z5.b[0]\n"
- "udot z26.s, z10.b, z6.b[0]\n"
- "udot z30.s, z10.b, z7.b[0]\n"
- "udot z19.s, z11.b, z4.b[0]\n"
- "udot z23.s, z11.b, z5.b[0]\n"
- "udot z27.s, z11.b, z6.b[0]\n"
- "udot z31.s, z11.b, z7.b[0]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[1]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "udot z20.s, z12.b, z5.b[1]\n"
- "udot z24.s, z12.b, z6.b[1]\n"
- "udot z28.s, z12.b, z7.b[1]\n"
- "udot z17.s, z13.b, z4.b[1]\n"
- "udot z21.s, z13.b, z5.b[1]\n"
- "udot z25.s, z13.b, z6.b[1]\n"
- "udot z29.s, z13.b, z7.b[1]\n"
- "udot z18.s, z14.b, z4.b[1]\n"
- "udot z22.s, z14.b, z5.b[1]\n"
- "udot z26.s, z14.b, z6.b[1]\n"
- "udot z30.s, z14.b, z7.b[1]\n"
- "udot z19.s, z15.b, z4.b[1]\n"
- "udot z23.s, z15.b, z5.b[1]\n"
- "udot z27.s, z15.b, z6.b[1]\n"
- "udot z31.s, z15.b, z7.b[1]\n"
- "b.eq 5f\n"
- "addvl %[b_ptr0], %[b_ptr0], #16\n"
- "subs %[blocks], %[blocks], #0x1\n"
- "ld1b z8.b, p7/z, [%[b_ptr0], #-8, MUL VL]\n"
- "ld1b z9.b, p7/z, [%[b_ptr0], #-7, MUL VL]\n"
- "ld1b z10.b, p7/z, [%[b_ptr0], #-6, MUL VL]\n"
- "ld1b z11.b, p7/z, [%[b_ptr0], #-5, MUL VL]\n"
- "udot z16.s, z8.b, z4.b[2]\n"
- "udot z20.s, z8.b, z5.b[2]\n"
- "udot z24.s, z8.b, z6.b[2]\n"
- "udot z28.s, z8.b, z7.b[2]\n"
- "udot z17.s, z9.b, z4.b[2]\n"
- "udot z21.s, z9.b, z5.b[2]\n"
- "udot z25.s, z9.b, z6.b[2]\n"
- "udot z29.s, z9.b, z7.b[2]\n"
- "udot z18.s, z10.b, z4.b[2]\n"
- "udot z22.s, z10.b, z5.b[2]\n"
- "udot z26.s, z10.b, z6.b[2]\n"
- "udot z30.s, z10.b, z7.b[2]\n"
- "udot z19.s, z11.b, z4.b[2]\n"
- "udot z23.s, z11.b, z5.b[2]\n"
- "udot z27.s, z11.b, z6.b[2]\n"
- "udot z31.s, z11.b, z7.b[2]\n"
- "b.eq 5f\n"
- "ld1b z12.b, p7/z, [%[b_ptr0], #-4, MUL VL]\n"
- "ld1b z13.b, p7/z, [%[b_ptr0], #-3, MUL VL]\n"
- "ld1b z14.b, p7/z, [%[b_ptr0], #-2, MUL VL]\n"
- "ld1b z15.b, p7/z, [%[b_ptr0], #-1, MUL VL]\n"
- "udot z16.s, z12.b, z4.b[3]\n"
- "udot z20.s, z12.b, z5.b[3]\n"
- "udot z24.s, z12.b, z6.b[3]\n"
- "udot z28.s, z12.b, z7.b[3]\n"
- "udot z17.s, z13.b, z4.b[3]\n"
- "udot z21.s, z13.b, z5.b[3]\n"
- "udot z25.s, z13.b, z6.b[3]\n"
- "udot z29.s, z13.b, z7.b[3]\n"
- "udot z18.s, z14.b, z4.b[3]\n"
- "udot z22.s, z14.b, z5.b[3]\n"
- "udot z26.s, z14.b, z6.b[3]\n"
- "udot z30.s, z14.b, z7.b[3]\n"
- "udot z19.s, z15.b, z4.b[3]\n"
- "udot z23.s, z15.b, z5.b[3]\n"
- "udot z27.s, z15.b, z6.b[3]\n"
- "udot z31.s, z15.b, z7.b[3]\n"
- "5:\n"
- "st1w z16.s, p0, [%[c_ptr0]]\n"
- "st1w z17.s, p1, [%[c_ptr0], #1, MUL VL]\n"
- "st1w z18.s, p2, [%[c_ptr0], #2, MUL VL]\n"
- "st1w z19.s, p3, [%[c_ptr0], #3, MUL VL]\n"
- "addvl %[c_ptr0], %[c_ptr0], #4\n"
- "st1w z20.s, p0, [c_ptr1]\n"
- "st1w z21.s, p1, [c_ptr1, #1, MUL VL]\n"
- "st1w z22.s, p2, [c_ptr1, #2, MUL VL]\n"
- "st1w z23.s, p3, [c_ptr1, #3, MUL VL]\n"
- "st1w z24.s, p0, [c_ptr2]\n"
- "st1w z25.s, p1, [c_ptr2, #1, MUL VL]\n"
- "st1w z26.s, p2, [c_ptr2, #2, MUL VL]\n"
- "st1w z27.s, p3, [c_ptr2, #3, MUL VL]\n"
- "st1w z28.s, p0, [c_ptr3]\n"
- "st1w z29.s, p1, [c_ptr3, #1, MUL VL]\n"
- "st1w z30.s, p2, [c_ptr3, #2, MUL VL]\n"
- "st1w z31.s, p3, [c_ptr3, #3, MUL VL]\n"
- ".unreq a_ptr1\n"
- ".unreq a_ptr2\n"
- ".unreq a_ptr3\n"
- ".unreq c_ptr1\n"
- ".unreq c_ptr2\n"
- ".unreq c_ptr3\n"
- : [a_ptr0] "+r" (a_ptr0), [b_ptr0] "+r" (b_ptr0), [c_ptr0] "+r" (c_ptr0), [loops] "+r" (loops), [regs] "+r" (regs), [temp] "+r" (temp), [blocks] "+r" (blocks)
- : [width] "r" (width), [accumulate] "r" (static_cast<uint64_t>(accumulate)), [lda] "r" (ldab), [ldc] "r" (ldcb), [leftovers] "r" (leftovers)
- : "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31", "x0", "x1", "x2", "x3", "x4", "x5", "cc", "memory"
- );
- break;
- }
-
- }
- }
-}
-
-} // namespace arm_gemm
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
new file mode 100644
index 0000000000..af9de4a6eb
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL.hpp
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#pragma once
+#ifdef __ARM_FEATURE_SVE
+
+#include "../std_transforms_sve.hpp"
+
+#define ARGLIST \
+ unsigned int, const unsigned int *, \
+ IndirectInputArg<uint8_t>, \
+ size_t, size_t, \
+ const uint8_t *, \
+ IndirectOutputArg<uint32_t>, \
+ const uint32_t *, Activation, bool // Kernel parameter type list shared by the prototype and kern_type below; #undef'd at the end of this header.
+
+namespace arm_gemm
+{
+
+// Kernel entry point (parameter types given by ARGLIST); defined in sve_hybrid_u8u32_dot_6x4VL/generic.cpp.
+void sve_hybrid_u8u32_dot_6x4VL( ARGLIST );
+
+class cls_sve_hybrid_u8u32_dot_6x4VL // Descriptor for the sve_hybrid_u8u32_dot_6x4VL kernel: element types, blocking parameters and the kernel function pointer.
+{
+public:
+ typedef uint8_t operand_type; // input element type (matches the uint8_t arguments in ARGLIST)
+ typedef uint32_t result_type; // output element type (matches IndirectOutputArg<uint32_t> in ARGLIST)
+
+ typedef void (*kern_type)( ARGLIST ); // pointer to a kernel function taking the ARGLIST parameters
+
+ /* Kernel blocking parameters: output tile height/width and the K unroll factor. */
+ static constexpr unsigned int out_height()
+ {
+ return 6; // the "6" in the kernel name; presumably output rows handled per kernel call - TODO confirm
+ }
+
+ static unsigned int out_width()
+ {
+ return get_vector_length<uint32_t>() * 4; // the "4VL" in the kernel name; not constexpr because it depends on get_vector_length<uint32_t>()
+ }
+
+ static constexpr unsigned int k_unroll()
+ {
+ return 4; // presumably K is consumed in groups of 4 uint8_t values - TODO confirm against the kernel
+ }
+
+ static constexpr bool supports_accumulate()
+ {
+ return true; // the kernel function takes a bool "accumulate" argument
+ }
+
+ StdTransformsSVE<operand_type, result_type, 6, 4, 4> transforms = {}; // 6, 4, 4 appear to mirror out_height(), the multiplier in out_width() and k_unroll() - keep in sync
+
+ // Kernel function to invoke; initialised to the generic implementation and never changed by the constructor below.
+ kern_type kernel=sve_hybrid_u8u32_dot_6x4VL;
+
+ cls_sve_hybrid_u8u32_dot_6x4VL(const CPUInfo *) // CPUInfo is ignored: no CPU-specific kernel is selected here
+ {
+ }
+};
+
+} // namespace arm_gemm
+
+#undef ARGLIST
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
new file mode 100644
index 0000000000..fc8ce636dd
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_hybrid_u8u32_dot_6x4VL/generic.cpp
@@ -0,0 +1,1904 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#ifdef __ARM_FEATURE_SVE
+
+#include "arm_gemm.hpp"
+#include "../../utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+void sve_hybrid_u8u32_dot_6x4VL (
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+ size_t M, size_t N, const uint8_t *B_ptr, IndirectOutputArg<uint32_t> output_arg,
+ const uint32_t *, Activation, bool accumulate
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings = {};
+ const unsigned int *string_lengths = {};
+ size_t N = {};
+ const uint8_t *B_ptr = {};
+ size_t output_offset = {};
+ size_t input_initial_col = {};
+ size_t input_offset = {};
+ } ka;
+
+ unsigned long flags=0;
+ void *output_ptr;
+ void *input_ptr;
+
+ if (output_arg.is_indirect) {
+ output_ptr=(void *)(output_arg.indirect.ptr);
+ ka.output_offset=output_arg.indirect.offset;
+ flags |= 0x4;
+ } else {
+ output_ptr=(void *)(output_arg.direct.base);
+ ka.output_offset=output_arg.direct.stride;
+ }
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ ka.input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ ka.input_offset=A_arg.direct.stride;
+ }
+ if (accumulate) {
+ flags |= 0x1;
+ }
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+ ka.N = N;
+ ka.B_ptr = B_ptr;
+ __asm__ __volatile__(
+ "ptrue p5.b\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 61f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 49f\n"
+ "beq 37f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 25f\n"
+ "beq 13f\n"
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 2f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "b 3f\n"
+ "2:" // Height 1: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "3:" // Height 1: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x15\n"
+ "tbz %x[flags], #0, 4f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "b 5f\n"
+ "4:" // Height 1: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "5:" // Height 1: setup done
+ "mov x12, #0x0\n"
+ "6:" // Height 1: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 7f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "cbnz x12, 8f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "b 8f\n"
+ "7:" // Height 1: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "8:" // Height 1: input setup done
+ "cmp x11, #0x10\n"
+ "ble 10f\n"
+ "9:" // Height 1: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "udot z8.s, z6.b, z0.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ "udot z9.s, z7.b, z0.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "cmp x11, #0x10\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "udot z11.s, z7.b, z0.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "udot z10.s, z6.b, z0.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[3]\n"
+ "udot z11.s, z7.b, z0.b[3]\n"
+ "bgt 9b\n"
+ "10:" // Height 1: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "udot z8.s, z6.b, z0.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "add x10, x10, #0x10\n"
+ "udot z9.s, z7.b, z0.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "udot z11.s, z7.b, z0.b[0]\n"
+ "ble 11f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "udot z9.s, z7.b, z0.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[1]\n"
+ "addvl x14, x14, #4\n"
+ "udot z11.s, z7.b, z0.b[1]\n"
+ "ble 11f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "udot z9.s, z7.b, z0.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[2]\n"
+ "addvl x14, x14, #4\n"
+ "udot z11.s, z7.b, z0.b[2]\n"
+ "ble 11f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[3]\n"
+ "udot z11.s, z7.b, z0.b[3]\n"
+ "11:" // Height 1: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 6b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "12:" // Height 1: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 3b\n"
+ "b 74f\n"
+ "13:" // Height 2
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 14f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "b 15f\n"
+ "14:" // Height 2: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "15:" // Height 2: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x15\n"
+ "tbz %x[flags], #0, 16f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "b 17f\n"
+ "16:" // Height 2: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "17:" // Height 2: setup done
+ "mov x12, #0x0\n"
+ "18:" // Height 2: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 19f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "cbnz x12, 20f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "b 20f\n"
+ "19:" // Height 2: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "20:" // Height 2: input setup done
+ "cmp x11, #0x10\n"
+ "ble 22f\n"
+ "21:" // Height 2: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "udot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "udot z9.s, z7.b, z0.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x28, x28, #0x10\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "cmp x11, #0x10\n"
+ "udot z13.s, z7.b, z1.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "udot z14.s, z6.b, z1.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[0]\n"
+ "udot z15.s, z7.b, z1.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[1]\n"
+ "udot z12.s, z6.b, z1.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[1]\n"
+ "udot z13.s, z7.b, z1.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "udot z10.s, z6.b, z0.b[1]\n"
+ "udot z14.s, z6.b, z1.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[1]\n"
+ "udot z15.s, z7.b, z1.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[2]\n"
+ "udot z12.s, z6.b, z1.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[2]\n"
+ "udot z13.s, z7.b, z1.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[2]\n"
+ "udot z14.s, z6.b, z1.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[2]\n"
+ "udot z15.s, z7.b, z1.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[3]\n"
+ "udot z12.s, z6.b, z1.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[3]\n"
+ "udot z13.s, z7.b, z1.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[3]\n"
+ "udot z14.s, z6.b, z1.b[3]\n"
+ "udot z11.s, z7.b, z0.b[3]\n"
+ "udot z15.s, z7.b, z1.b[3]\n"
+ "bgt 21b\n"
+ "22:" // Height 2: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "udot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "udot z9.s, z7.b, z0.b[0]\n"
+ "add x28, x28, #0x10\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z13.s, z7.b, z1.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "udot z14.s, z6.b, z1.b[0]\n"
+ "udot z11.s, z7.b, z0.b[0]\n"
+ "udot z15.s, z7.b, z1.b[0]\n"
+ "ble 23f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "udot z12.s, z6.b, z1.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[1]\n"
+ "udot z13.s, z7.b, z1.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[1]\n"
+ "udot z14.s, z6.b, z1.b[1]\n"
+ "udot z11.s, z7.b, z0.b[1]\n"
+ "udot z15.s, z7.b, z1.b[1]\n"
+ "ble 23f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "udot z12.s, z6.b, z1.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[2]\n"
+ "udot z13.s, z7.b, z1.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[2]\n"
+ "udot z14.s, z6.b, z1.b[2]\n"
+ "udot z11.s, z7.b, z0.b[2]\n"
+ "udot z15.s, z7.b, z1.b[2]\n"
+ "ble 23f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "udot z12.s, z6.b, z1.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[3]\n"
+ "udot z13.s, z7.b, z1.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[3]\n"
+ "udot z14.s, z6.b, z1.b[3]\n"
+ "udot z11.s, z7.b, z0.b[3]\n"
+ "udot z15.s, z7.b, z1.b[3]\n"
+ "23:" // Height 2: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 18b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "24:" // Height 2: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 15b\n"
+ "b 74f\n"
+ "25:" // Height 3
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 26f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "add x27, x27, x19, LSL #2\n"
+ "b 27f\n"
+ "26:" // Height 3: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "27:" // Height 3: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x15\n"
+ "tbz %x[flags], #0, 28f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "b 29f\n"
+ "28:" // Height 3: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "29:" // Height 3: setup done
+ "mov x12, #0x0\n"
+ "30:" // Height 3: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 31f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "cbnz x12, 32f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "b 32f\n"
+ "31:" // Height 3: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "32:" // Height 3: input setup done
+ "cmp x11, #0x10\n"
+ "ble 34f\n"
+ "33:" // Height 3: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "udot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "udot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x26, x26, #0x10\n"
+ "udot z16.s, z6.b, z2.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "cmp x11, #0x10\n"
+ "udot z13.s, z7.b, z1.b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "udot z14.s, z6.b, z1.b[0]\n"
+ "udot z18.s, z6.b, z2.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[0]\n"
+ "udot z15.s, z7.b, z1.b[0]\n"
+ "udot z19.s, z7.b, z2.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[1]\n"
+ "udot z12.s, z6.b, z1.b[1]\n"
+ "udot z16.s, z6.b, z2.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[1]\n"
+ "udot z13.s, z7.b, z1.b[1]\n"
+ "udot z17.s, z7.b, z2.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "udot z10.s, z6.b, z0.b[1]\n"
+ "udot z14.s, z6.b, z1.b[1]\n"
+ "udot z18.s, z6.b, z2.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[1]\n"
+ "udot z15.s, z7.b, z1.b[1]\n"
+ "udot z19.s, z7.b, z2.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[2]\n"
+ "udot z12.s, z6.b, z1.b[2]\n"
+ "udot z16.s, z6.b, z2.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[2]\n"
+ "udot z13.s, z7.b, z1.b[2]\n"
+ "udot z17.s, z7.b, z2.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[2]\n"
+ "udot z14.s, z6.b, z1.b[2]\n"
+ "udot z18.s, z6.b, z2.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[2]\n"
+ "udot z15.s, z7.b, z1.b[2]\n"
+ "udot z19.s, z7.b, z2.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[3]\n"
+ "udot z12.s, z6.b, z1.b[3]\n"
+ "udot z16.s, z6.b, z2.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[3]\n"
+ "udot z13.s, z7.b, z1.b[3]\n"
+ "udot z17.s, z7.b, z2.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[3]\n"
+ "udot z14.s, z6.b, z1.b[3]\n"
+ "udot z18.s, z6.b, z2.b[3]\n"
+ "udot z11.s, z7.b, z0.b[3]\n"
+ "udot z15.s, z7.b, z1.b[3]\n"
+ "udot z19.s, z7.b, z2.b[3]\n"
+ "bgt 33b\n"
+ "34:" // Height 3: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "udot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "udot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
+ "add x26, x26, #0x10\n"
+ "udot z13.s, z7.b, z1.b[0]\n"
+ "udot z16.s, z6.b, z2.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "udot z14.s, z6.b, z1.b[0]\n"
+ "udot z18.s, z6.b, z2.b[0]\n"
+ "udot z11.s, z7.b, z0.b[0]\n"
+ "udot z15.s, z7.b, z1.b[0]\n"
+ "udot z19.s, z7.b, z2.b[0]\n"
+ "ble 35f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "udot z12.s, z6.b, z1.b[1]\n"
+ "udot z16.s, z6.b, z2.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[1]\n"
+ "udot z13.s, z7.b, z1.b[1]\n"
+ "udot z17.s, z7.b, z2.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[1]\n"
+ "udot z14.s, z6.b, z1.b[1]\n"
+ "udot z18.s, z6.b, z2.b[1]\n"
+ "udot z11.s, z7.b, z0.b[1]\n"
+ "udot z15.s, z7.b, z1.b[1]\n"
+ "udot z19.s, z7.b, z2.b[1]\n"
+ "ble 35f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "udot z12.s, z6.b, z1.b[2]\n"
+ "udot z16.s, z6.b, z2.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[2]\n"
+ "udot z13.s, z7.b, z1.b[2]\n"
+ "udot z17.s, z7.b, z2.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[2]\n"
+ "udot z14.s, z6.b, z1.b[2]\n"
+ "udot z18.s, z6.b, z2.b[2]\n"
+ "udot z11.s, z7.b, z0.b[2]\n"
+ "udot z15.s, z7.b, z1.b[2]\n"
+ "udot z19.s, z7.b, z2.b[2]\n"
+ "ble 35f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "udot z12.s, z6.b, z1.b[3]\n"
+ "udot z16.s, z6.b, z2.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[3]\n"
+ "udot z13.s, z7.b, z1.b[3]\n"
+ "udot z17.s, z7.b, z2.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[3]\n"
+ "udot z14.s, z6.b, z1.b[3]\n"
+ "udot z18.s, z6.b, z2.b[3]\n"
+ "udot z11.s, z7.b, z0.b[3]\n"
+ "udot z15.s, z7.b, z1.b[3]\n"
+ "udot z19.s, z7.b, z2.b[3]\n"
+ "35:" // Height 3: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 30b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "36:" // Height 3: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 27b\n"
+ "b 74f\n"
+ "37:" // Height 4
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 38f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "b 39f\n"
+ "38:" // Height 4: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "39:" // Height 4: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x15\n"
+ "tbz %x[flags], #0, 40f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25]\n"
+ "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "b 41f\n"
+ "40:" // Height 4: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "41:" // Height 4: setup done
+ "mov x12, #0x0\n"
+ "42:" // Height 4: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 43f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "cbnz x12, 44f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "b 44f\n"
+ "43:" // Height 4: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "44:" // Height 4: input setup done
+ "cmp x11, #0x10\n"
+ "ble 46f\n"
+ "45:" // Height 4: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "udot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "udot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "udot z16.s, z6.b, z2.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x24, x24, #0x10\n"
+ "udot z13.s, z7.b, z1.b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x10\n"
+ "udot z20.s, z6.b, z3.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "udot z21.s, z7.b, z3.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "udot z14.s, z6.b, z1.b[0]\n"
+ "udot z18.s, z6.b, z2.b[0]\n"
+ "udot z22.s, z6.b, z3.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[0]\n"
+ "udot z15.s, z7.b, z1.b[0]\n"
+ "udot z19.s, z7.b, z2.b[0]\n"
+ "udot z23.s, z7.b, z3.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[1]\n"
+ "udot z12.s, z6.b, z1.b[1]\n"
+ "udot z16.s, z6.b, z2.b[1]\n"
+ "udot z20.s, z6.b, z3.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[1]\n"
+ "udot z13.s, z7.b, z1.b[1]\n"
+ "udot z17.s, z7.b, z2.b[1]\n"
+ "udot z21.s, z7.b, z3.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "udot z10.s, z6.b, z0.b[1]\n"
+ "udot z14.s, z6.b, z1.b[1]\n"
+ "udot z18.s, z6.b, z2.b[1]\n"
+ "udot z22.s, z6.b, z3.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[1]\n"
+ "udot z15.s, z7.b, z1.b[1]\n"
+ "udot z19.s, z7.b, z2.b[1]\n"
+ "udot z23.s, z7.b, z3.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[2]\n"
+ "udot z12.s, z6.b, z1.b[2]\n"
+ "udot z16.s, z6.b, z2.b[2]\n"
+ "udot z20.s, z6.b, z3.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[2]\n"
+ "udot z13.s, z7.b, z1.b[2]\n"
+ "udot z17.s, z7.b, z2.b[2]\n"
+ "udot z21.s, z7.b, z3.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[2]\n"
+ "udot z14.s, z6.b, z1.b[2]\n"
+ "udot z18.s, z6.b, z2.b[2]\n"
+ "udot z22.s, z6.b, z3.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[2]\n"
+ "udot z15.s, z7.b, z1.b[2]\n"
+ "udot z19.s, z7.b, z2.b[2]\n"
+ "udot z23.s, z7.b, z3.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[3]\n"
+ "udot z12.s, z6.b, z1.b[3]\n"
+ "udot z16.s, z6.b, z2.b[3]\n"
+ "udot z20.s, z6.b, z3.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[3]\n"
+ "udot z13.s, z7.b, z1.b[3]\n"
+ "udot z17.s, z7.b, z2.b[3]\n"
+ "udot z21.s, z7.b, z3.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[3]\n"
+ "udot z14.s, z6.b, z1.b[3]\n"
+ "udot z18.s, z6.b, z2.b[3]\n"
+ "udot z22.s, z6.b, z3.b[3]\n"
+ "udot z11.s, z7.b, z0.b[3]\n"
+ "udot z15.s, z7.b, z1.b[3]\n"
+ "udot z19.s, z7.b, z2.b[3]\n"
+ "udot z23.s, z7.b, z3.b[3]\n"
+ "bgt 45b\n"
+ "46:" // Height 4: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "udot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "udot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "udot z16.s, z6.b, z2.b[0]\n"
+ "add x24, x24, #0x10\n"
+ "udot z13.s, z7.b, z1.b[0]\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
+ "udot z20.s, z6.b, z3.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z21.s, z7.b, z3.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "udot z14.s, z6.b, z1.b[0]\n"
+ "udot z18.s, z6.b, z2.b[0]\n"
+ "udot z22.s, z6.b, z3.b[0]\n"
+ "udot z11.s, z7.b, z0.b[0]\n"
+ "udot z15.s, z7.b, z1.b[0]\n"
+ "udot z19.s, z7.b, z2.b[0]\n"
+ "udot z23.s, z7.b, z3.b[0]\n"
+ "ble 47f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "udot z12.s, z6.b, z1.b[1]\n"
+ "udot z16.s, z6.b, z2.b[1]\n"
+ "udot z20.s, z6.b, z3.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[1]\n"
+ "udot z13.s, z7.b, z1.b[1]\n"
+ "udot z17.s, z7.b, z2.b[1]\n"
+ "udot z21.s, z7.b, z3.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[1]\n"
+ "udot z14.s, z6.b, z1.b[1]\n"
+ "udot z18.s, z6.b, z2.b[1]\n"
+ "udot z22.s, z6.b, z3.b[1]\n"
+ "udot z11.s, z7.b, z0.b[1]\n"
+ "udot z15.s, z7.b, z1.b[1]\n"
+ "udot z19.s, z7.b, z2.b[1]\n"
+ "udot z23.s, z7.b, z3.b[1]\n"
+ "ble 47f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "udot z12.s, z6.b, z1.b[2]\n"
+ "udot z16.s, z6.b, z2.b[2]\n"
+ "udot z20.s, z6.b, z3.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[2]\n"
+ "udot z13.s, z7.b, z1.b[2]\n"
+ "udot z17.s, z7.b, z2.b[2]\n"
+ "udot z21.s, z7.b, z3.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[2]\n"
+ "udot z14.s, z6.b, z1.b[2]\n"
+ "udot z18.s, z6.b, z2.b[2]\n"
+ "udot z22.s, z6.b, z3.b[2]\n"
+ "udot z11.s, z7.b, z0.b[2]\n"
+ "udot z15.s, z7.b, z1.b[2]\n"
+ "udot z19.s, z7.b, z2.b[2]\n"
+ "udot z23.s, z7.b, z3.b[2]\n"
+ "ble 47f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "udot z12.s, z6.b, z1.b[3]\n"
+ "udot z16.s, z6.b, z2.b[3]\n"
+ "udot z20.s, z6.b, z3.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[3]\n"
+ "udot z13.s, z7.b, z1.b[3]\n"
+ "udot z17.s, z7.b, z2.b[3]\n"
+ "udot z21.s, z7.b, z3.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[3]\n"
+ "udot z14.s, z6.b, z1.b[3]\n"
+ "udot z18.s, z6.b, z2.b[3]\n"
+ "udot z22.s, z6.b, z3.b[3]\n"
+ "udot z11.s, z7.b, z0.b[3]\n"
+ "udot z15.s, z7.b, z1.b[3]\n"
+ "udot z19.s, z7.b, z2.b[3]\n"
+ "udot z23.s, z7.b, z3.b[3]\n"
+ "47:" // Height 4: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 42b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1w { z20.s }, p4, [x25]\n"
+ "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "48:" // Height 4: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 39b\n"
+ "b 74f\n"
+ "49:" // Height 5
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 50f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "b 51f\n"
+ "50:" // Height 5: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "51:" // Height 5: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x15\n"
+ "tbz %x[flags], #0, 52f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25]\n"
+ "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x23]\n"
+ "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "b 53f\n"
+ "52:" // Height 5: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "53:" // Height 5: setup done
+ "mov x12, #0x0\n"
+ "54:" // Height 5: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 55f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "cbnz x12, 56f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "b 56f\n"
+ "55:" // Height 5: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "56:" // Height 5: input setup done
+ "cmp x11, #0x10\n"
+ "ble 58f\n"
+ "57:" // Height 5: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "udot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "udot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "udot z16.s, z6.b, z2.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "udot z13.s, z7.b, z1.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x22, x22, #0x10\n"
+ "udot z20.s, z6.b, z3.b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x10\n"
+ "udot z24.s, z6.b, z4.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "udot z21.s, z7.b, z3.b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "udot z25.s, z7.b, z4.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "udot z14.s, z6.b, z1.b[0]\n"
+ "udot z18.s, z6.b, z2.b[0]\n"
+ "udot z22.s, z6.b, z3.b[0]\n"
+ "udot z26.s, z6.b, z4.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[0]\n"
+ "udot z15.s, z7.b, z1.b[0]\n"
+ "udot z19.s, z7.b, z2.b[0]\n"
+ "udot z23.s, z7.b, z3.b[0]\n"
+ "udot z27.s, z7.b, z4.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[1]\n"
+ "udot z12.s, z6.b, z1.b[1]\n"
+ "udot z16.s, z6.b, z2.b[1]\n"
+ "udot z20.s, z6.b, z3.b[1]\n"
+ "udot z24.s, z6.b, z4.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[1]\n"
+ "udot z13.s, z7.b, z1.b[1]\n"
+ "udot z17.s, z7.b, z2.b[1]\n"
+ "udot z21.s, z7.b, z3.b[1]\n"
+ "udot z25.s, z7.b, z4.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "udot z10.s, z6.b, z0.b[1]\n"
+ "udot z14.s, z6.b, z1.b[1]\n"
+ "udot z18.s, z6.b, z2.b[1]\n"
+ "udot z22.s, z6.b, z3.b[1]\n"
+ "udot z26.s, z6.b, z4.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[1]\n"
+ "udot z15.s, z7.b, z1.b[1]\n"
+ "udot z19.s, z7.b, z2.b[1]\n"
+ "udot z23.s, z7.b, z3.b[1]\n"
+ "udot z27.s, z7.b, z4.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[2]\n"
+ "udot z12.s, z6.b, z1.b[2]\n"
+ "udot z16.s, z6.b, z2.b[2]\n"
+ "udot z20.s, z6.b, z3.b[2]\n"
+ "udot z24.s, z6.b, z4.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[2]\n"
+ "udot z13.s, z7.b, z1.b[2]\n"
+ "udot z17.s, z7.b, z2.b[2]\n"
+ "udot z21.s, z7.b, z3.b[2]\n"
+ "udot z25.s, z7.b, z4.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[2]\n"
+ "udot z14.s, z6.b, z1.b[2]\n"
+ "udot z18.s, z6.b, z2.b[2]\n"
+ "udot z22.s, z6.b, z3.b[2]\n"
+ "udot z26.s, z6.b, z4.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[2]\n"
+ "udot z15.s, z7.b, z1.b[2]\n"
+ "udot z19.s, z7.b, z2.b[2]\n"
+ "udot z23.s, z7.b, z3.b[2]\n"
+ "udot z27.s, z7.b, z4.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[3]\n"
+ "udot z12.s, z6.b, z1.b[3]\n"
+ "udot z16.s, z6.b, z2.b[3]\n"
+ "udot z20.s, z6.b, z3.b[3]\n"
+ "udot z24.s, z6.b, z4.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[3]\n"
+ "udot z13.s, z7.b, z1.b[3]\n"
+ "udot z17.s, z7.b, z2.b[3]\n"
+ "udot z21.s, z7.b, z3.b[3]\n"
+ "udot z25.s, z7.b, z4.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[3]\n"
+ "udot z14.s, z6.b, z1.b[3]\n"
+ "udot z18.s, z6.b, z2.b[3]\n"
+ "udot z22.s, z6.b, z3.b[3]\n"
+ "udot z26.s, z6.b, z4.b[3]\n"
+ "udot z11.s, z7.b, z0.b[3]\n"
+ "udot z15.s, z7.b, z1.b[3]\n"
+ "udot z19.s, z7.b, z2.b[3]\n"
+ "udot z23.s, z7.b, z3.b[3]\n"
+ "udot z27.s, z7.b, z4.b[3]\n"
+ "bgt 57b\n"
+ "58:" // Height 5: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "udot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "udot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "udot z16.s, z6.b, z2.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "udot z13.s, z7.b, z1.b[0]\n"
+ "add x22, x22, #0x10\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
+ "udot z20.s, z6.b, z3.b[0]\n"
+ "udot z24.s, z6.b, z4.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z21.s, z7.b, z3.b[0]\n"
+ "udot z25.s, z7.b, z4.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "udot z14.s, z6.b, z1.b[0]\n"
+ "udot z18.s, z6.b, z2.b[0]\n"
+ "udot z22.s, z6.b, z3.b[0]\n"
+ "udot z26.s, z6.b, z4.b[0]\n"
+ "udot z11.s, z7.b, z0.b[0]\n"
+ "udot z15.s, z7.b, z1.b[0]\n"
+ "udot z19.s, z7.b, z2.b[0]\n"
+ "udot z23.s, z7.b, z3.b[0]\n"
+ "udot z27.s, z7.b, z4.b[0]\n"
+ "ble 59f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "udot z12.s, z6.b, z1.b[1]\n"
+ "udot z16.s, z6.b, z2.b[1]\n"
+ "udot z20.s, z6.b, z3.b[1]\n"
+ "udot z24.s, z6.b, z4.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[1]\n"
+ "udot z13.s, z7.b, z1.b[1]\n"
+ "udot z17.s, z7.b, z2.b[1]\n"
+ "udot z21.s, z7.b, z3.b[1]\n"
+ "udot z25.s, z7.b, z4.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[1]\n"
+ "udot z14.s, z6.b, z1.b[1]\n"
+ "udot z18.s, z6.b, z2.b[1]\n"
+ "udot z22.s, z6.b, z3.b[1]\n"
+ "udot z26.s, z6.b, z4.b[1]\n"
+ "udot z11.s, z7.b, z0.b[1]\n"
+ "udot z15.s, z7.b, z1.b[1]\n"
+ "udot z19.s, z7.b, z2.b[1]\n"
+ "udot z23.s, z7.b, z3.b[1]\n"
+ "udot z27.s, z7.b, z4.b[1]\n"
+ "ble 59f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "udot z12.s, z6.b, z1.b[2]\n"
+ "udot z16.s, z6.b, z2.b[2]\n"
+ "udot z20.s, z6.b, z3.b[2]\n"
+ "udot z24.s, z6.b, z4.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[2]\n"
+ "udot z13.s, z7.b, z1.b[2]\n"
+ "udot z17.s, z7.b, z2.b[2]\n"
+ "udot z21.s, z7.b, z3.b[2]\n"
+ "udot z25.s, z7.b, z4.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[2]\n"
+ "udot z14.s, z6.b, z1.b[2]\n"
+ "udot z18.s, z6.b, z2.b[2]\n"
+ "udot z22.s, z6.b, z3.b[2]\n"
+ "udot z26.s, z6.b, z4.b[2]\n"
+ "udot z11.s, z7.b, z0.b[2]\n"
+ "udot z15.s, z7.b, z1.b[2]\n"
+ "udot z19.s, z7.b, z2.b[2]\n"
+ "udot z23.s, z7.b, z3.b[2]\n"
+ "udot z27.s, z7.b, z4.b[2]\n"
+ "ble 59f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "udot z12.s, z6.b, z1.b[3]\n"
+ "udot z16.s, z6.b, z2.b[3]\n"
+ "udot z20.s, z6.b, z3.b[3]\n"
+ "udot z24.s, z6.b, z4.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[3]\n"
+ "udot z13.s, z7.b, z1.b[3]\n"
+ "udot z17.s, z7.b, z2.b[3]\n"
+ "udot z21.s, z7.b, z3.b[3]\n"
+ "udot z25.s, z7.b, z4.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[3]\n"
+ "udot z14.s, z6.b, z1.b[3]\n"
+ "udot z18.s, z6.b, z2.b[3]\n"
+ "udot z22.s, z6.b, z3.b[3]\n"
+ "udot z26.s, z6.b, z4.b[3]\n"
+ "udot z11.s, z7.b, z0.b[3]\n"
+ "udot z15.s, z7.b, z1.b[3]\n"
+ "udot z19.s, z7.b, z2.b[3]\n"
+ "udot z23.s, z7.b, z3.b[3]\n"
+ "udot z27.s, z7.b, z4.b[3]\n"
+ "59:" // Height 5: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 54b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1w { z20.s }, p4, [x25]\n"
+ "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "st1w { z24.s }, p4, [x23]\n"
+ "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "60:" // Height 5: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 51b\n"
+ "b 74f\n"
+ "61:" // Height 6
+ "ldr x15, [%x[args_ptr], %[offsetof_N]]\n"
+ "ldr x14, [%x[args_ptr], %[offsetof_B_ptr]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_output_offset]]\n"
+ "tbz %x[flags], #2, 62f\n"
+ "ldr x13, [%x[output_ptr], #0x0]\n"
+ "add x13, x13, x19, LSL #2\n"
+ "ldr x9, [%x[output_ptr], #0x8]\n"
+ "ldr x27, [%x[output_ptr], #0x10]\n"
+ "add x9, x9, x19, LSL #2\n"
+ "ldr x25, [%x[output_ptr], #0x18]\n"
+ "ldr x23, [%x[output_ptr], #0x20]\n"
+ "add x27, x27, x19, LSL #2\n"
+ "ldr x21, [%x[output_ptr], #0x28]\n"
+ "add %x[output_ptr], %x[output_ptr], #0x30\n"
+ "add x25, x25, x19, LSL #2\n"
+ "add x23, x23, x19, LSL #2\n"
+ "add x21, x21, x19, LSL #2\n"
+ "b 63f\n"
+ "62:" // Height 6: setup direct output
+ "mov x13, %x[output_ptr]\n"
+ "add x9, x13, x19, LSL #2\n"
+ "add x27, x9, x19, LSL #2\n"
+ "add x25, x27, x19, LSL #2\n"
+ "add x23, x25, x19, LSL #2\n"
+ "add x21, x23, x19, LSL #2\n"
+ "add %x[output_ptr], x21, x19, LSL #2\n"
+ "63:" // Height 6: Column loop
+ "mov x19, #0x0\n"
+ "whilelt p4.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p3.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p2.s, x19, x15\n"
+ "incw x19\n"
+ "whilelt p1.s, x19, x15\n"
+ "tbz %x[flags], #0, 64f\n"
+ "ld1w { z8.s }, p4/Z, [x13]\n"
+ "ld1w { z9.s }, p3/Z, [x13, #1, MUL VL]\n"
+ "ld1w { z10.s }, p2/Z, [x13, #2, MUL VL]\n"
+ "ld1w { z11.s }, p1/Z, [x13, #3, MUL VL]\n"
+ "ld1w { z12.s }, p4/Z, [x9]\n"
+ "ld1w { z13.s }, p3/Z, [x9, #1, MUL VL]\n"
+ "ld1w { z14.s }, p2/Z, [x9, #2, MUL VL]\n"
+ "ld1w { z15.s }, p1/Z, [x9, #3, MUL VL]\n"
+ "ld1w { z16.s }, p4/Z, [x27]\n"
+ "ld1w { z17.s }, p3/Z, [x27, #1, MUL VL]\n"
+ "ld1w { z18.s }, p2/Z, [x27, #2, MUL VL]\n"
+ "ld1w { z19.s }, p1/Z, [x27, #3, MUL VL]\n"
+ "ld1w { z20.s }, p4/Z, [x25]\n"
+ "ld1w { z21.s }, p3/Z, [x25, #1, MUL VL]\n"
+ "ld1w { z22.s }, p2/Z, [x25, #2, MUL VL]\n"
+ "ld1w { z23.s }, p1/Z, [x25, #3, MUL VL]\n"
+ "ld1w { z24.s }, p4/Z, [x23]\n"
+ "ld1w { z25.s }, p3/Z, [x23, #1, MUL VL]\n"
+ "ld1w { z26.s }, p2/Z, [x23, #2, MUL VL]\n"
+ "ld1w { z27.s }, p1/Z, [x23, #3, MUL VL]\n"
+ "ld1w { z28.s }, p4/Z, [x21]\n"
+ "ld1w { z29.s }, p3/Z, [x21, #1, MUL VL]\n"
+ "ld1w { z30.s }, p2/Z, [x21, #2, MUL VL]\n"
+ "ld1w { z31.s }, p1/Z, [x21, #3, MUL VL]\n"
+ "b 65f\n"
+ "64:" // Height 6: no accumulate
+ "mov z8.s, #0x0\n"
+ "mov z9.s, #0x0\n"
+ "mov z10.s, #0x0\n"
+ "mov z11.s, #0x0\n"
+ "mov z12.s, #0x0\n"
+ "mov z13.s, #0x0\n"
+ "mov z14.s, #0x0\n"
+ "mov z15.s, #0x0\n"
+ "mov z16.s, #0x0\n"
+ "mov z17.s, #0x0\n"
+ "mov z18.s, #0x0\n"
+ "mov z19.s, #0x0\n"
+ "mov z20.s, #0x0\n"
+ "mov z21.s, #0x0\n"
+ "mov z22.s, #0x0\n"
+ "mov z23.s, #0x0\n"
+ "mov z24.s, #0x0\n"
+ "mov z25.s, #0x0\n"
+ "mov z26.s, #0x0\n"
+ "mov z27.s, #0x0\n"
+ "mov z28.s, #0x0\n"
+ "mov z29.s, #0x0\n"
+ "mov z30.s, #0x0\n"
+ "mov z31.s, #0x0\n"
+ "65:" // Height 6: setup done
+ "mov x12, #0x0\n"
+ "66:" // Height 6: String loop
+ "ldr x20, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "ldr w11, [x20, x12, LSL #0x2]\n"
+ "tbz %x[flags], #3, 67f\n"
+ "ldr x20, [%x[input_ptr], x12, LSL #0x3]\n"
+ "add x20, x20, x19, LSL #3\n"
+ "ldr x10, [x20, #0x0]\n"
+ "ldr x28, [x20, #0x8]\n"
+ "ldr x26, [x20, #0x10]\n"
+ "ldr x24, [x20, #0x18]\n"
+ "ldr x22, [x20, #0x20]\n"
+ "ldr x20, [x20, #0x28]\n"
+ "cbnz x12, 68f\n"
+ "ldr x19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x10, x10, x19\n"
+ "add x28, x28, x19\n"
+ "add x26, x26, x19\n"
+ "add x24, x24, x19\n"
+ "add x22, x22, x19\n"
+ "add x20, x20, x19\n"
+ "b 68f\n"
+ "67:" // Height 6: setup direct input
+ "mov x10, %x[input_ptr]\n"
+ "add x28, x10, x19\n"
+ "add x26, x28, x19\n"
+ "add x24, x26, x19\n"
+ "add x22, x24, x19\n"
+ "add x20, x22, x19\n"
+ "68:" // Height 6: input setup done
+ "cmp x11, #0x10\n"
+ "ble 70f\n"
+ "69:" // Height 6: Multiply loop: Main loop head
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "sub x11, x11, #0x10\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "udot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "udot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "udot z16.s, z6.b, z2.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "udot z13.s, z7.b, z1.b[0]\n"
+ "ld1rqb { z5.b }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "udot z20.s, z6.b, z3.b[0]\n"
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x20, x20, #0x10\n"
+ "udot z24.s, z6.b, z4.b[0]\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "cmp x11, #0x10\n"
+ "udot z28.s, z6.b, z5.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "udot z21.s, z7.b, z3.b[0]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "udot z25.s, z7.b, z4.b[0]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "udot z29.s, z7.b, z5.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "udot z14.s, z6.b, z1.b[0]\n"
+ "udot z18.s, z6.b, z2.b[0]\n"
+ "udot z22.s, z6.b, z3.b[0]\n"
+ "udot z26.s, z6.b, z4.b[0]\n"
+ "udot z30.s, z6.b, z5.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #4, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[0]\n"
+ "udot z15.s, z7.b, z1.b[0]\n"
+ "udot z19.s, z7.b, z2.b[0]\n"
+ "udot z23.s, z7.b, z3.b[0]\n"
+ "udot z27.s, z7.b, z4.b[0]\n"
+ "udot z31.s, z7.b, z5.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #5, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[1]\n"
+ "udot z12.s, z6.b, z1.b[1]\n"
+ "udot z16.s, z6.b, z2.b[1]\n"
+ "udot z20.s, z6.b, z3.b[1]\n"
+ "udot z24.s, z6.b, z4.b[1]\n"
+ "udot z28.s, z6.b, z5.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #6, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[1]\n"
+ "udot z13.s, z7.b, z1.b[1]\n"
+ "udot z17.s, z7.b, z2.b[1]\n"
+ "udot z21.s, z7.b, z3.b[1]\n"
+ "udot z25.s, z7.b, z4.b[1]\n"
+ "udot z29.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #7, MUL VL]\n"
+ "addvl x14, x14, #16\n"
+ "udot z10.s, z6.b, z0.b[1]\n"
+ "udot z14.s, z6.b, z1.b[1]\n"
+ "udot z18.s, z6.b, z2.b[1]\n"
+ "udot z22.s, z6.b, z3.b[1]\n"
+ "udot z26.s, z6.b, z4.b[1]\n"
+ "udot z30.s, z6.b, z5.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-8, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[1]\n"
+ "udot z15.s, z7.b, z1.b[1]\n"
+ "udot z19.s, z7.b, z2.b[1]\n"
+ "udot z23.s, z7.b, z3.b[1]\n"
+ "udot z27.s, z7.b, z4.b[1]\n"
+ "udot z31.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-7, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[2]\n"
+ "udot z12.s, z6.b, z1.b[2]\n"
+ "udot z16.s, z6.b, z2.b[2]\n"
+ "udot z20.s, z6.b, z3.b[2]\n"
+ "udot z24.s, z6.b, z4.b[2]\n"
+ "udot z28.s, z6.b, z5.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-6, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[2]\n"
+ "udot z13.s, z7.b, z1.b[2]\n"
+ "udot z17.s, z7.b, z2.b[2]\n"
+ "udot z21.s, z7.b, z3.b[2]\n"
+ "udot z25.s, z7.b, z4.b[2]\n"
+ "udot z29.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-5, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[2]\n"
+ "udot z14.s, z6.b, z1.b[2]\n"
+ "udot z18.s, z6.b, z2.b[2]\n"
+ "udot z22.s, z6.b, z3.b[2]\n"
+ "udot z26.s, z6.b, z4.b[2]\n"
+ "udot z30.s, z6.b, z5.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-4, MUL VL]\n"
+ "udot z11.s, z7.b, z0.b[2]\n"
+ "udot z15.s, z7.b, z1.b[2]\n"
+ "udot z19.s, z7.b, z2.b[2]\n"
+ "udot z23.s, z7.b, z3.b[2]\n"
+ "udot z27.s, z7.b, z4.b[2]\n"
+ "udot z31.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-3, MUL VL]\n"
+ "udot z8.s, z6.b, z0.b[3]\n"
+ "udot z12.s, z6.b, z1.b[3]\n"
+ "udot z16.s, z6.b, z2.b[3]\n"
+ "udot z20.s, z6.b, z3.b[3]\n"
+ "udot z24.s, z6.b, z4.b[3]\n"
+ "udot z28.s, z6.b, z5.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #-2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[3]\n"
+ "udot z13.s, z7.b, z1.b[3]\n"
+ "udot z17.s, z7.b, z2.b[3]\n"
+ "udot z21.s, z7.b, z3.b[3]\n"
+ "udot z25.s, z7.b, z4.b[3]\n"
+ "udot z29.s, z7.b, z5.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #-1, MUL VL]\n"
+ "udot z10.s, z6.b, z0.b[3]\n"
+ "udot z14.s, z6.b, z1.b[3]\n"
+ "udot z18.s, z6.b, z2.b[3]\n"
+ "udot z22.s, z6.b, z3.b[3]\n"
+ "udot z26.s, z6.b, z4.b[3]\n"
+ "udot z30.s, z6.b, z5.b[3]\n"
+ "udot z11.s, z7.b, z0.b[3]\n"
+ "udot z15.s, z7.b, z1.b[3]\n"
+ "udot z19.s, z7.b, z2.b[3]\n"
+ "udot z23.s, z7.b, z3.b[3]\n"
+ "udot z27.s, z7.b, z4.b[3]\n"
+ "udot z31.s, z7.b, z5.b[3]\n"
+ "bgt 69b\n"
+ "70:" // Height 6: Multiply loop: Single iteration only
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "whilelt p0.b, XZR, x11\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "ld1rqb { z0.b }, p0/Z, [x10]\n"
+ "udot z8.s, z6.b, z0.b[0]\n"
+ "ld1rqb { z1.b }, p0/Z, [x28]\n"
+ "add x10, x10, #0x10\n"
+ "udot z9.s, z7.b, z0.b[0]\n"
+ "ld1rqb { z2.b }, p0/Z, [x26]\n"
+ "add x28, x28, #0x10\n"
+ "udot z12.s, z6.b, z1.b[0]\n"
+ "ld1rqb { z3.b }, p0/Z, [x24]\n"
+ "add x26, x26, #0x10\n"
+ "udot z16.s, z6.b, z2.b[0]\n"
+ "ld1rqb { z4.b }, p0/Z, [x22]\n"
+ "add x24, x24, #0x10\n"
+ "udot z13.s, z7.b, z1.b[0]\n"
+ "ld1rqb { z5.b }, p0/Z, [x20]\n"
+ "add x22, x22, #0x10\n"
+ "udot z20.s, z6.b, z3.b[0]\n"
+ "add x20, x20, #0x10\n"
+ "udot z17.s, z7.b, z2.b[0]\n"
+ "udot z24.s, z6.b, z4.b[0]\n"
+ "udot z28.s, z6.b, z5.b[0]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z21.s, z7.b, z3.b[0]\n"
+ "udot z25.s, z7.b, z4.b[0]\n"
+ "udot z29.s, z7.b, z5.b[0]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[0]\n"
+ "udot z14.s, z6.b, z1.b[0]\n"
+ "udot z18.s, z6.b, z2.b[0]\n"
+ "udot z22.s, z6.b, z3.b[0]\n"
+ "udot z26.s, z6.b, z4.b[0]\n"
+ "udot z30.s, z6.b, z5.b[0]\n"
+ "udot z11.s, z7.b, z0.b[0]\n"
+ "udot z15.s, z7.b, z1.b[0]\n"
+ "udot z19.s, z7.b, z2.b[0]\n"
+ "udot z23.s, z7.b, z3.b[0]\n"
+ "udot z27.s, z7.b, z4.b[0]\n"
+ "udot z31.s, z7.b, z5.b[0]\n"
+ "ble 71f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "udot z12.s, z6.b, z1.b[1]\n"
+ "udot z16.s, z6.b, z2.b[1]\n"
+ "udot z20.s, z6.b, z3.b[1]\n"
+ "udot z24.s, z6.b, z4.b[1]\n"
+ "udot z28.s, z6.b, z5.b[1]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[1]\n"
+ "udot z13.s, z7.b, z1.b[1]\n"
+ "udot z17.s, z7.b, z2.b[1]\n"
+ "udot z21.s, z7.b, z3.b[1]\n"
+ "udot z25.s, z7.b, z4.b[1]\n"
+ "udot z29.s, z7.b, z5.b[1]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[1]\n"
+ "udot z14.s, z6.b, z1.b[1]\n"
+ "udot z18.s, z6.b, z2.b[1]\n"
+ "udot z22.s, z6.b, z3.b[1]\n"
+ "udot z26.s, z6.b, z4.b[1]\n"
+ "udot z30.s, z6.b, z5.b[1]\n"
+ "udot z11.s, z7.b, z0.b[1]\n"
+ "udot z15.s, z7.b, z1.b[1]\n"
+ "udot z19.s, z7.b, z2.b[1]\n"
+ "udot z23.s, z7.b, z3.b[1]\n"
+ "udot z27.s, z7.b, z4.b[1]\n"
+ "udot z31.s, z7.b, z5.b[1]\n"
+ "ble 71f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "subs x11, x11, #0x4\n"
+ "udot z12.s, z6.b, z1.b[2]\n"
+ "udot z16.s, z6.b, z2.b[2]\n"
+ "udot z20.s, z6.b, z3.b[2]\n"
+ "udot z24.s, z6.b, z4.b[2]\n"
+ "udot z28.s, z6.b, z5.b[2]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[2]\n"
+ "udot z13.s, z7.b, z1.b[2]\n"
+ "udot z17.s, z7.b, z2.b[2]\n"
+ "udot z21.s, z7.b, z3.b[2]\n"
+ "udot z25.s, z7.b, z4.b[2]\n"
+ "udot z29.s, z7.b, z5.b[2]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[2]\n"
+ "udot z14.s, z6.b, z1.b[2]\n"
+ "udot z18.s, z6.b, z2.b[2]\n"
+ "udot z22.s, z6.b, z3.b[2]\n"
+ "udot z26.s, z6.b, z4.b[2]\n"
+ "udot z30.s, z6.b, z5.b[2]\n"
+ "udot z11.s, z7.b, z0.b[2]\n"
+ "udot z15.s, z7.b, z1.b[2]\n"
+ "udot z19.s, z7.b, z2.b[2]\n"
+ "udot z23.s, z7.b, z3.b[2]\n"
+ "udot z27.s, z7.b, z4.b[2]\n"
+ "udot z31.s, z7.b, z5.b[2]\n"
+ "ble 71f\n"
+ "ld1b { z6.b }, p5/Z, [x14]\n"
+ "udot z8.s, z6.b, z0.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #1, MUL VL]\n"
+ "udot z12.s, z6.b, z1.b[3]\n"
+ "udot z16.s, z6.b, z2.b[3]\n"
+ "udot z20.s, z6.b, z3.b[3]\n"
+ "udot z24.s, z6.b, z4.b[3]\n"
+ "udot z28.s, z6.b, z5.b[3]\n"
+ "ld1b { z6.b }, p5/Z, [x14, #2, MUL VL]\n"
+ "udot z9.s, z7.b, z0.b[3]\n"
+ "udot z13.s, z7.b, z1.b[3]\n"
+ "udot z17.s, z7.b, z2.b[3]\n"
+ "udot z21.s, z7.b, z3.b[3]\n"
+ "udot z25.s, z7.b, z4.b[3]\n"
+ "udot z29.s, z7.b, z5.b[3]\n"
+ "ld1b { z7.b }, p5/Z, [x14, #3, MUL VL]\n"
+ "addvl x14, x14, #4\n"
+ "udot z10.s, z6.b, z0.b[3]\n"
+ "udot z14.s, z6.b, z1.b[3]\n"
+ "udot z18.s, z6.b, z2.b[3]\n"
+ "udot z22.s, z6.b, z3.b[3]\n"
+ "udot z26.s, z6.b, z4.b[3]\n"
+ "udot z30.s, z6.b, z5.b[3]\n"
+ "udot z11.s, z7.b, z0.b[3]\n"
+ "udot z15.s, z7.b, z1.b[3]\n"
+ "udot z19.s, z7.b, z2.b[3]\n"
+ "udot z23.s, z7.b, z3.b[3]\n"
+ "udot z27.s, z7.b, z4.b[3]\n"
+ "udot z31.s, z7.b, z5.b[3]\n"
+ "71:" // Height 6: Multiply loop: multiply skip
+ "prfm pldl1keep, [x10, #0x80]\n"
+ "add x12, x12, #0x1\n"
+ "prfm pldl1keep, [x28, #0x80]\n"
+ "prfm pldl1keep, [x26, #0x80]\n"
+ "prfm pldl1keep, [x24, #0x80]\n"
+ "prfm pldl1keep, [x22, #0x80]\n"
+ "prfm pldl1keep, [x20, #0x80]\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "cmp x12, x19\n"
+ "bne 66b\n"
+ "prfm pstl1keep, [x13, #0x0]\n"
+ "prfm pstl1keep, [x9, #0x0]\n"
+ "prfm pstl1keep, [x27, #0x0]\n"
+ "prfm pstl1keep, [x25, #0x0]\n"
+ "prfm pstl1keep, [x23, #0x0]\n"
+ "prfm pstl1keep, [x21, #0x0]\n"
+ "st1w { z8.s }, p4, [x13]\n"
+ "st1w { z9.s }, p3, [x13, #1, MUL VL]\n"
+ "st1w { z10.s }, p2, [x13, #2, MUL VL]\n"
+ "st1w { z11.s }, p1, [x13, #3, MUL VL]\n"
+ "addvl x13, x13, #4\n"
+ "st1w { z12.s }, p4, [x9]\n"
+ "st1w { z13.s }, p3, [x9, #1, MUL VL]\n"
+ "st1w { z14.s }, p2, [x9, #2, MUL VL]\n"
+ "st1w { z15.s }, p1, [x9, #3, MUL VL]\n"
+ "addvl x9, x9, #4\n"
+ "st1w { z16.s }, p4, [x27]\n"
+ "st1w { z17.s }, p3, [x27, #1, MUL VL]\n"
+ "st1w { z18.s }, p2, [x27, #2, MUL VL]\n"
+ "st1w { z19.s }, p1, [x27, #3, MUL VL]\n"
+ "addvl x27, x27, #4\n"
+ "st1w { z20.s }, p4, [x25]\n"
+ "st1w { z21.s }, p3, [x25, #1, MUL VL]\n"
+ "st1w { z22.s }, p2, [x25, #2, MUL VL]\n"
+ "st1w { z23.s }, p1, [x25, #3, MUL VL]\n"
+ "addvl x25, x25, #4\n"
+ "st1w { z24.s }, p4, [x23]\n"
+ "st1w { z25.s }, p3, [x23, #1, MUL VL]\n"
+ "st1w { z26.s }, p2, [x23, #2, MUL VL]\n"
+ "st1w { z27.s }, p1, [x23, #3, MUL VL]\n"
+ "addvl x23, x23, #4\n"
+ "st1w { z28.s }, p4, [x21]\n"
+ "st1w { z29.s }, p3, [x21, #1, MUL VL]\n"
+ "st1w { z30.s }, p2, [x21, #2, MUL VL]\n"
+ "st1w { z31.s }, p1, [x21, #3, MUL VL]\n"
+ "addvl x21, x21, #4\n"
+ "72:" // Height 6: Writeback done
+ "mov x19, #0x0\n"
+ "incw x19, ALL, MUL #4\n"
+ "subs x15, x15, x19\n"
+ "bgt 63b\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "beq 74f\n"
+ "ldr x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "tbz %x[flags], #3, 73f\n"
+ "add x20, x20, #0x6\n"
+ "str x20, [%x[args_ptr], %[offsetof_input_offset]]\n"
+ "b 1b\n"
+ "73:" // Update direct input
+ "mov x19, #0x6\n"
+ "madd %x[input_ptr], x19, x20, %x[input_ptr]\n"
+ "b 1b\n"
+ "74:" // Exit
+
+ : [M] "+r" (M), [input_ptr] "+r" (input_ptr), [output_ptr] "+r" (output_ptr)
+ : [args_ptr] "r" (&ka), [flags] "r" (flags), [offsetof_B_ptr] "I" (offsetof(KernelArgs, B_ptr)), [offsetof_N] "I" (offsetof(KernelArgs, N)), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_input_offset] "I" (offsetof(KernelArgs, input_offset)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_output_offset] "I" (offsetof(KernelArgs, output_offset)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths))
+ : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x9", "x10", "x11", "x12", "x13", "x14", "x15", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31"
+ );
+}
+
+} // namespace arm_gemm
+#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp
index 43107e45fa..12bb758b68 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL.hpp
@@ -31,9 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void sve_interleaved_bf16fp32_dot_3VLx8(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+void sve_interleaved_bf16fp32_dot_8x3VL(const bfloat16 *, const bfloat16 *, float *, int, int, int);
-class interleaved_bf16fp32_dot_3VLx8 {
+class cls_sve_interleaved_bf16fp32_dot_8x3VL {
public:
typedef bfloat16 operand_type;
typedef float result_type;
@@ -59,9 +59,9 @@ public:
// Use the standard fixed size transforms.
StdTransformsSVE<operand_type, result_type, 8, 3, 2, 1> transforms = {};
- kern_type kernel=sve_interleaved_bf16fp32_dot_3VLx8;
+ kern_type kernel=sve_interleaved_bf16fp32_dot_8x3VL;
- interleaved_bf16fp32_dot_3VLx8(const CPUInfo *)
+ cls_sve_interleaved_bf16fp32_dot_8x3VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp
index 7e20ed0971..adee900337 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_dot_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void sve_interleaved_bf16fp32_dot_3VLx8(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_bf16fp32_dot_8x3VL(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
const bfloat16 *a_ptr = Apanel;
float *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
index f1353e2086..2889dd7f0f 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL.hpp
@@ -31,9 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void sve_interleaved_bf16fp32_mmla_3VLx8(const bfloat16 *, const bfloat16 *, float *, int, int, int);
+void sve_interleaved_bf16fp32_mmla_8x3VL(const bfloat16 *, const bfloat16 *, float *, int, int, int);
-class interleaved_bf16fp32_mmla_3VLx8 {
+class cls_sve_interleaved_bf16fp32_mmla_8x3VL {
public:
typedef bfloat16 operand_type;
typedef float result_type;
@@ -59,9 +59,9 @@ public:
// Use the standard fixed size transforms.
StdTransformsSVE<operand_type, result_type, 8, 6, 4, 2> transforms = {};
- kern_type kernel=sve_interleaved_bf16fp32_mmla_3VLx8;
+ kern_type kernel=sve_interleaved_bf16fp32_mmla_8x3VL;
- interleaved_bf16fp32_mmla_3VLx8(const CPUInfo *)
+ cls_sve_interleaved_bf16fp32_mmla_8x3VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp
index 16cc69b2a6..e43404e608 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_bf16fp32_mmla_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void sve_interleaved_bf16fp32_mmla_3VLx8(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_bf16fp32_mmla_8x3VL(const bfloat16 *Apanel, const bfloat16 *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
const bfloat16 *a_ptr = Apanel;
float *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp
index 816c0cd095..eb946d9dfa 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL.hpp
@@ -31,9 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void sve_interleaved_fp16_mla_3VLx8(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
+void sve_interleaved_fp16_mla_8x3VL(const __fp16 *, const __fp16 *, __fp16 *, int, int, int);
-class interleaved_fp16_mla_3VLx8 {
+class cls_sve_interleaved_fp16_mla_8x3VL {
public:
typedef __fp16 operand_type;
typedef __fp16 result_type;
@@ -59,9 +59,9 @@ public:
// Use the standard fixed size transforms.
StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1> transforms = {};
- kern_type kernel=sve_interleaved_fp16_mla_3VLx8;
+ kern_type kernel=sve_interleaved_fp16_mla_8x3VL;
- interleaved_fp16_mla_3VLx8(const CPUInfo *)
+ cls_sve_interleaved_fp16_mla_8x3VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp
index f2050cbd56..46b8770409 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp16_mla_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void sve_interleaved_fp16_mla_3VLx8(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_fp16_mla_8x3VL(const __fp16 *Apanel, const __fp16 *Bpanel, __fp16 *Cpanel, int ablocks, int bblocks, int K) {
const __fp16 *a_ptr = Apanel;
__fp16 *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp
index cce90fb135..b84ba83b6a 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL.hpp
@@ -31,9 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void sve_interleaved_fp32_mla_3VLx8(const float *, const float *, float *, int, int, int);
+void sve_interleaved_fp32_mla_8x3VL(const float *, const float *, float *, int, int, int);
-class interleaved_fp32_mla_3VLx8 {
+class cls_sve_interleaved_fp32_mla_8x3VL {
public:
typedef float operand_type;
typedef float result_type;
@@ -59,9 +59,9 @@ public:
// Use the standard fixed size transforms.
StdTransformsSVE<operand_type, result_type, 8, 3, 1, 1> transforms = {};
- kern_type kernel=sve_interleaved_fp32_mla_3VLx8;
+ kern_type kernel=sve_interleaved_fp32_mla_8x3VL;
- interleaved_fp32_mla_3VLx8(const CPUInfo *)
+ cls_sve_interleaved_fp32_mla_8x3VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp
index cd178c478a..1e05a308b5 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mla_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void sve_interleaved_fp32_mla_3VLx8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_fp32_mla_8x3VL(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
const float *a_ptr = Apanel;
float *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL.hpp
index 4ca43cd5c9..96216960ff 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL.hpp
@@ -31,9 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void sve_interleaved_fp32_mmla_3VLx8(const float *, const float *, float *, int, int, int);
+void sve_interleaved_fp32_mmla_8x3VL(const float *, const float *, float *, int, int, int);
-class interleaved_fp32_mmla_3VLx8 {
+class cls_sve_interleaved_fp32_mmla_8x3VL {
public:
typedef float operand_type;
typedef float result_type;
@@ -59,9 +59,9 @@ public:
// Use the standard fixed size transforms.
StdTransformsSVE<operand_type, result_type, 8, 6, 2, 2> transforms = {};
- kern_type kernel=sve_interleaved_fp32_mmla_3VLx8;
+ kern_type kernel=sve_interleaved_fp32_mmla_8x3VL;
- interleaved_fp32_mmla_3VLx8(const CPUInfo *)
+ cls_sve_interleaved_fp32_mmla_8x3VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp
index a404ae9c82..39daf0ff20 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_fp32_mmla_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void sve_interleaved_fp32_mmla_3VLx8(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_fp32_mmla_8x3VL(const float *Apanel, const float *Bpanel, float *Cpanel, int ablocks, int bblocks, int K) {
const float *a_ptr = Apanel;
float *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
index e40ba215b4..3e16915cd4 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL.hpp
@@ -31,9 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void sve_interleaved_s8s32_dot_3VLx8(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void sve_interleaved_s8s32_dot_8x3VL(const int8_t *, const int8_t *, int32_t *, int, int, int);
-class interleaved_s8s32_dot_3VLx8 {
+class cls_sve_interleaved_s8s32_dot_8x3VL {
public:
typedef int8_t operand_type;
typedef int32_t result_type;
@@ -58,10 +58,11 @@ public:
// Use the standard fixed size transforms.
StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1, true> transforms_quantized = {};
- kern_type kernel=sve_interleaved_s8s32_dot_3VLx8;
+ kern_type kernel=sve_interleaved_s8s32_dot_8x3VL;
- interleaved_s8s32_dot_3VLx8(const CPUInfo *)
+ cls_sve_interleaved_s8s32_dot_8x3VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp
index cdc70705c5..674c2400bf 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_dot_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void sve_interleaved_s8s32_dot_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_s8s32_dot_8x3VL(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
const int8_t *a_ptr = Apanel;
int32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
index 361598d594..02b3451c54 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL.hpp
@@ -31,9 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void sve_interleaved_s8s32_mmla_3VLx8(const int8_t *, const int8_t *, int32_t *, int, int, int);
+void sve_interleaved_s8s32_mmla_8x3VL(const int8_t *, const int8_t *, int32_t *, int, int, int);
-class interleaved_s8s32_mmla_3VLx8 {
+class cls_sve_interleaved_s8s32_mmla_8x3VL {
public:
typedef int8_t operand_type;
typedef int32_t result_type;
@@ -58,10 +58,11 @@ public:
// Use the standard fixed size transforms.
StdTransformsSVE<operand_type, result_type, 8, 6, 8, 2> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 6, 8, 2, true> transforms_quantized = {};
- kern_type kernel=sve_interleaved_s8s32_mmla_3VLx8;
+ kern_type kernel=sve_interleaved_s8s32_mmla_8x3VL;
- interleaved_s8s32_mmla_3VLx8(const CPUInfo *)
+ cls_sve_interleaved_s8s32_mmla_8x3VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp
index cde9ec32e9..578aa01732 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_s8s32_mmla_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void sve_interleaved_s8s32_mmla_3VLx8(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_s8s32_mmla_8x3VL(const int8_t *Apanel, const int8_t *Bpanel, int32_t *Cpanel, int ablocks, int bblocks, int K) {
const int8_t *a_ptr = Apanel;
int32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
index 252f38ec63..832a224199 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL.hpp
@@ -31,9 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void sve_interleaved_u8u32_dot_8x3VL(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
-class interleaved_u8u32_dot_3VLx8 {
+class cls_sve_interleaved_u8u32_dot_8x3VL {
public:
typedef uint8_t operand_type;
typedef uint32_t result_type;
@@ -58,10 +58,11 @@ public:
// Use the standard fixed size transforms.
StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 3, 4, 1, true> transforms_quantized = {};
- kern_type kernel=sve_interleaved_u8u32_dot_3VLx8;
+ kern_type kernel=sve_interleaved_u8u32_dot_8x3VL;
- interleaved_u8u32_dot_3VLx8(const CPUInfo *)
+ cls_sve_interleaved_u8u32_dot_8x3VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp
index 6626f8463b..891869c767 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_dot_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void sve_interleaved_u8u32_dot_3VLx8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_u8u32_dot_8x3VL(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
const uint8_t *a_ptr = Apanel;
uint32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
index ed44a9d8fc..4fdaab84bd 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL.hpp
@@ -31,9 +31,9 @@
namespace arm_gemm {
// Actual kernel implementations
-void sve_interleaved_u8u32_mmla_3VLx8(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
+void sve_interleaved_u8u32_mmla_8x3VL(const uint8_t *, const uint8_t *, uint32_t *, int, int, int);
-class interleaved_u8u32_mmla_3VLx8 {
+class cls_sve_interleaved_u8u32_mmla_8x3VL {
public:
typedef uint8_t operand_type;
typedef uint32_t result_type;
@@ -58,10 +58,11 @@ public:
// Use the standard fixed size transforms.
StdTransformsSVE<operand_type, result_type, 8, 6, 8, 2> transforms = {};
+ StdTransformsSVE<operand_type, result_type, 8, 6, 8, 2, true> transforms_quantized = {};
- kern_type kernel=sve_interleaved_u8u32_mmla_3VLx8;
+ kern_type kernel=sve_interleaved_u8u32_mmla_8x3VL;
- interleaved_u8u32_mmla_3VLx8(const CPUInfo *)
+ cls_sve_interleaved_u8u32_mmla_8x3VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp
index 81a1dbcf51..fa08a9d091 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_3VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_interleaved_u8u32_mmla_8x3VL/generic.cpp
@@ -28,7 +28,7 @@
namespace arm_gemm {
-void sve_interleaved_u8u32_mmla_3VLx8(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
+void sve_interleaved_u8u32_mmla_8x3VL(const uint8_t *Apanel, const uint8_t *Bpanel, uint32_t *Cpanel, int ablocks, int bblocks, int K) {
const uint8_t *a_ptr = Apanel;
uint32_t *c_ptr = Cpanel;
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp
index b555066195..2097d76a54 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL.hpp
@@ -31,9 +31,9 @@ namespace arm_gemm
{
// Actual kernel implementations
-void sve_smallK_hybrid_fp32_mla_1VLx8(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
+void sve_smallK_hybrid_fp32_mla_8x1VL(const float *, int, const float *, float *, int, int, int, int, const float *, Activation, bool);
-class smallK_hybrid_fp32_mla_1VLx8
+class cls_sve_smallK_hybrid_fp32_mla_8x1VL
{
public:
typedef float operand_type;
@@ -75,9 +75,9 @@ public:
StdTransformsSVE<operand_type, result_type, 8, 1, 1> transforms = {};
// Default to the generic kernel
- kern_type kernel=sve_smallK_hybrid_fp32_mla_1VLx8;
+ kern_type kernel=sve_smallK_hybrid_fp32_mla_8x1VL;
- smallK_hybrid_fp32_mla_1VLx8(const CPUInfo *)
+ cls_sve_smallK_hybrid_fp32_mla_8x1VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp
index 5501688054..e07cfa8218 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_1VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_fp32_mla_8x1VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void sve_smallK_hybrid_fp32_mla_1VLx8(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
+void sve_smallK_hybrid_fp32_mla_8x1VL(const float *A, int lda, const float *B, float *C, int ldc, int M, int N, int K, const float *bias, Activation act, bool) {
const long loops_count = iceildiv(N, (int)get_vector_length<float>()) - 1;
const long ldab = lda * sizeof(float);
const long ldcb = ldc * sizeof(float);
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp
index eef1e4cc65..e50c05ba39 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL.hpp
@@ -31,9 +31,9 @@ namespace arm_gemm
{
// Actual kernel implementations
-void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
+void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *, int, const int8_t *, int32_t *, int, int, int, int, const int32_t *, Activation, bool);
-class smallK_hybrid_s8s32_dot_1VLx8
+class cls_sve_smallK_hybrid_s8s32_dot_8x1VL
{
public:
typedef int8_t operand_type;
@@ -75,9 +75,9 @@ public:
StdTransformsSVE<operand_type, result_type, 8, 1, 4> transforms = {};
// Default to the generic kernel
- kern_type kernel=sve_smallK_hybrid_s8s32_dot_1VLx8;
+ kern_type kernel=sve_smallK_hybrid_s8s32_dot_8x1VL;
- smallK_hybrid_s8s32_dot_1VLx8(const CPUInfo *)
+ cls_sve_smallK_hybrid_s8s32_dot_8x1VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp
index e2fbdcb61b..5770076d04 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_1VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_s8s32_dot_8x1VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *, Activation, bool) {
+void sve_smallK_hybrid_s8s32_dot_8x1VL(const int8_t *A, int lda, const int8_t *B, int32_t *C, int ldc, int M, int N, int K, const int32_t *bias, Activation act, bool) {
const long loops_count = iceildiv(N, (int)get_vector_length<int32_t>()) - 1;
const long ldab = lda * sizeof(int8_t);
const long ldcb = ldc * sizeof(int32_t);
@@ -112,55 +112,54 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7]\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
"mov z27.s, #0\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"mov z28.s, #0\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
"mov z29.s, #0\n"
- "ld1rqb z1.b, p6/z, [a_ptr1]\n"
"mov z30.s, #0\n"
- "ld1rqb z2.b, p6/z, [a_ptr2]\n"
"mov z31.s, #0\n"
- "ld1rqb z3.b, p6/z, [a_ptr3]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4]\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "cbz %[loops], 2f\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
"sdot z24.s, z16.b, z0.b[0]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"mov z26.s, #0\n"
@@ -186,10 +185,9 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"mov z31.s, #0\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
@@ -201,6 +199,8 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
@@ -230,23 +230,34 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -325,112 +336,112 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #2\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
"mov z28.s, #0\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
"mov z29.s, #0\n"
- "ld1rqb z1.b, p6/z, [a_ptr1]\n"
"mov z30.s, #0\n"
- "ld1rqb z2.b, p6/z, [a_ptr2]\n"
"mov z31.s, #0\n"
- "ld1rqb z3.b, p6/z, [a_ptr3]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4]\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
"sdot z28.s, z17.b, z4.b[1]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "st1w z25.s, p7, [c_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
+ "addvl %[b_ptr0], %[b_ptr0], #2\n"
"sdot z24.s, z16.b, z0.b[0]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"mov z26.s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"mov z27.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"st1w z28.s, p7, [c_ptr4]\n"
"mov z28.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"sdot z27.s, z16.b, z3.b[0]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"sdot z28.s, z16.b, z4.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z29.s, z16.b, z5.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
"prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
"prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"addvl %[b_ptr0], %[b_ptr0], #2\n"
"st1w z25.s, p7, [c_ptr1]\n"
@@ -470,23 +481,42 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -565,48 +595,50 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #3\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
"mov z28.s, #0\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
"mov z29.s, #0\n"
- "ld1rqb z1.b, p6/z, [a_ptr1]\n"
"mov z30.s, #0\n"
- "ld1rqb z2.b, p6/z, [a_ptr2]\n"
"mov z31.s, #0\n"
- "ld1rqb z3.b, p6/z, [a_ptr3]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4]\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
@@ -618,49 +650,46 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "st1w z25.s, p7, [c_ptr1]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"mov z26.s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"mov z27.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"st1w z28.s, p7, [c_ptr4]\n"
"mov z28.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"sdot z27.s, z16.b, z3.b[0]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"sdot z28.s, z16.b, z4.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z29.s, z16.b, z5.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #3\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
@@ -676,7 +705,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z30.s, z17.b, z6.b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
@@ -690,11 +718,12 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"mov z26.s, #0\n"
@@ -720,8 +749,9 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"mov z31.s, #0\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #3\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
@@ -737,23 +767,50 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -832,50 +889,52 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
"mov z28.s, #0\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
"mov z29.s, #0\n"
- "ld1rqb z1.b, p6/z, [a_ptr1]\n"
"mov z30.s, #0\n"
- "ld1rqb z2.b, p6/z, [a_ptr2]\n"
"mov z31.s, #0\n"
- "ld1rqb z3.b, p6/z, [a_ptr3]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4]\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
@@ -894,50 +953,47 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z19.b, z5.b[3]\n"
"sdot z30.s, z19.b, z6.b[3]\n"
"sdot z31.s, z19.b, z7.b[3]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "st1w z25.s, p7, [c_ptr1]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"mov z26.s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"mov z27.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"st1w z28.s, p7, [c_ptr4]\n"
"mov z28.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"sdot z27.s, z16.b, z3.b[0]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"sdot z28.s, z16.b, z4.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z29.s, z16.b, z5.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
@@ -953,7 +1009,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z30.s, z17.b, z6.b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
@@ -962,7 +1017,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z19.b, z0.b[3]\n"
"sdot z25.s, z19.b, z1.b[3]\n"
"sdot z26.s, z19.b, z2.b[3]\n"
@@ -976,14 +1030,16 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"mov z26.s, #0\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
"st1w z27.s, p7, [c_ptr3]\n"
@@ -1006,8 +1062,9 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"mov z31.s, #0\n"
"addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
@@ -1031,23 +1088,58 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z19.b, z5.b[3]\n"
"sdot z30.s, z19.b, z6.b[3]\n"
"sdot z31.s, z19.b, z7.b[3]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1126,46 +1218,48 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #5\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
@@ -1205,84 +1299,79 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #5\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "mov z28.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "mov z29.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
"addvl c_ptr5, c_ptr5, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
"sdot z25.s, z19.b, z1.b[3]\n"
@@ -1300,7 +1389,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
"sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"sdot z25.s, z20.b, z1.b[0]\n"
"sdot z26.s, z20.b, z2.b[0]\n"
"sdot z27.s, z20.b, z3.b[0]\n"
@@ -1313,39 +1401,41 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #5\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "mov z28.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "mov z29.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
@@ -1354,6 +1444,8 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"addvl c_ptr6, c_ptr6, #1\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
@@ -1392,23 +1484,82 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1487,48 +1638,50 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #6\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
@@ -1575,85 +1728,80 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "st1w z25.s, p7, [c_ptr1]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #6\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"mov z29.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr5, c_ptr5, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
"sdot z25.s, z19.b, z1.b[3]\n"
@@ -1671,7 +1819,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
"sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"sdot z25.s, z20.b, z1.b[0]\n"
"sdot z26.s, z20.b, z2.b[0]\n"
"sdot z27.s, z20.b, z3.b[0]\n"
@@ -1679,7 +1826,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"sdot z24.s, z21.b, z0.b[1]\n"
"sdot z25.s, z21.b, z1.b[1]\n"
"sdot z26.s, z21.b, z2.b[1]\n"
@@ -1693,47 +1839,52 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #6\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
"addvl c_ptr6, c_ptr6, #1\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
@@ -1780,23 +1931,90 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1875,48 +2093,50 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #7\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
@@ -1972,86 +2192,81 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #7\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
"sdot z25.s, z19.b, z1.b[3]\n"
@@ -2069,7 +2284,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
"sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"sdot z25.s, z20.b, z1.b[0]\n"
"sdot z26.s, z20.b, z2.b[0]\n"
"sdot z27.s, z20.b, z3.b[0]\n"
@@ -2077,7 +2291,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"sdot z24.s, z21.b, z0.b[1]\n"
"sdot z25.s, z21.b, z1.b[1]\n"
"sdot z26.s, z21.b, z2.b[1]\n"
@@ -2086,7 +2299,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"sdot z24.s, z22.b, z0.b[2]\n"
"sdot z25.s, z22.b, z1.b[2]\n"
"sdot z26.s, z22.b, z2.b[2]\n"
@@ -2100,47 +2312,135 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #7\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "sdot z24.s, z22.b, z0.b[2]\n"
+ "sdot z25.s, z22.b, z1.b[2]\n"
+ "sdot z26.s, z22.b, z2.b[2]\n"
+ "sdot z27.s, z22.b, z3.b[2]\n"
+ "sdot z28.s, z22.b, z4.b[2]\n"
+ "sdot z29.s, z22.b, z5.b[2]\n"
+ "sdot z30.s, z22.b, z6.b[2]\n"
+ "sdot z31.s, z22.b, z7.b[2]\n"
+ "b 5f\n"
+ "2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
@@ -2195,23 +2495,16 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
- "2:\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2290,49 +2583,51 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
@@ -2396,87 +2691,82 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z23.b, z5.b[3]\n"
"sdot z30.s, z23.b, z6.b[3]\n"
"sdot z31.s, z23.b, z7.b[3]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z25.s, p7, [c_ptr1]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
"sdot z25.s, z19.b, z1.b[3]\n"
@@ -2494,7 +2784,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
"sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"sdot z25.s, z20.b, z1.b[0]\n"
"sdot z26.s, z20.b, z2.b[0]\n"
"sdot z27.s, z20.b, z3.b[0]\n"
@@ -2502,7 +2791,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"sdot z24.s, z21.b, z0.b[1]\n"
"sdot z25.s, z21.b, z1.b[1]\n"
"sdot z26.s, z21.b, z2.b[1]\n"
@@ -2511,7 +2799,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"sdot z24.s, z22.b, z0.b[2]\n"
"sdot z25.s, z22.b, z1.b[2]\n"
"sdot z26.s, z22.b, z2.b[2]\n"
@@ -2520,7 +2807,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"sdot z24.s, z23.b, z0.b[3]\n"
"sdot z25.s, z23.b, z1.b[3]\n"
"sdot z26.s, z23.b, z2.b[3]\n"
@@ -2534,47 +2820,144 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "sdot z24.s, z22.b, z0.b[2]\n"
+ "sdot z25.s, z22.b, z1.b[2]\n"
+ "sdot z26.s, z22.b, z2.b[2]\n"
+ "sdot z27.s, z22.b, z3.b[2]\n"
+ "sdot z28.s, z22.b, z4.b[2]\n"
+ "sdot z29.s, z22.b, z5.b[2]\n"
+ "sdot z30.s, z22.b, z6.b[2]\n"
+ "sdot z31.s, z22.b, z7.b[2]\n"
+ "sdot z24.s, z23.b, z0.b[3]\n"
+ "sdot z25.s, z23.b, z1.b[3]\n"
+ "sdot z26.s, z23.b, z2.b[3]\n"
+ "sdot z27.s, z23.b, z3.b[3]\n"
+ "sdot z28.s, z23.b, z4.b[3]\n"
+ "sdot z29.s, z23.b, z5.b[3]\n"
+ "sdot z30.s, z23.b, z6.b[3]\n"
+ "sdot z31.s, z23.b, z7.b[3]\n"
+ "b 5f\n"
+ "2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
@@ -2637,23 +3020,16 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z23.b, z5.b[3]\n"
"sdot z30.s, z23.b, z6.b[3]\n"
"sdot z31.s, z23.b, z7.b[3]\n"
- "2:\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2732,54 +3108,56 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
"sdot z28.s, z17.b, z4.b[1]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
@@ -2856,88 +3234,84 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
"sdot z25.s, z19.b, z1.b[3]\n"
@@ -2955,7 +3329,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
"sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"sdot z25.s, z20.b, z1.b[0]\n"
"sdot z26.s, z20.b, z2.b[0]\n"
"sdot z27.s, z20.b, z3.b[0]\n"
@@ -2963,7 +3336,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"sdot z24.s, z21.b, z0.b[1]\n"
"sdot z25.s, z21.b, z1.b[1]\n"
"sdot z26.s, z21.b, z2.b[1]\n"
@@ -2972,7 +3344,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"sdot z24.s, z22.b, z0.b[2]\n"
"sdot z25.s, z22.b, z1.b[2]\n"
"sdot z26.s, z22.b, z2.b[2]\n"
@@ -2981,7 +3352,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"sdot z24.s, z23.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
"sdot z25.s, z23.b, z1.b[3]\n"
@@ -2999,7 +3369,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z23.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"sdot z25.s, z16.b, z1.b[0]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
@@ -3007,55 +3376,62 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"addvl %[b_ptr0], %[b_ptr0], #1\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
@@ -3133,23 +3509,124 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "sdot z24.s, z22.b, z0.b[2]\n"
+ "sdot z25.s, z22.b, z1.b[2]\n"
+ "sdot z26.s, z22.b, z2.b[2]\n"
+ "sdot z27.s, z22.b, z3.b[2]\n"
+ "sdot z28.s, z22.b, z4.b[2]\n"
+ "sdot z29.s, z22.b, z5.b[2]\n"
+ "sdot z30.s, z22.b, z6.b[2]\n"
+ "sdot z31.s, z22.b, z7.b[2]\n"
+ "sdot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
+ "sdot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
+ "sdot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
+ "sdot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
+ "sdot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
+ "sdot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
+ "sdot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
+ "sdot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -3228,52 +3705,54 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
@@ -3361,88 +3840,85 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "sdot z25.s, z18.b, z1.b[2]\n"
"addvl %[b_ptr0], %[b_ptr0], #2\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
"sdot z25.s, z19.b, z1.b[3]\n"
@@ -3460,7 +3936,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
"sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"sdot z25.s, z20.b, z1.b[0]\n"
"sdot z26.s, z20.b, z2.b[0]\n"
"sdot z27.s, z20.b, z3.b[0]\n"
@@ -3468,7 +3943,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"sdot z24.s, z21.b, z0.b[1]\n"
"sdot z25.s, z21.b, z1.b[1]\n"
"sdot z26.s, z21.b, z2.b[1]\n"
@@ -3477,7 +3951,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"sdot z24.s, z22.b, z0.b[2]\n"
"sdot z25.s, z22.b, z1.b[2]\n"
"sdot z26.s, z22.b, z2.b[2]\n"
@@ -3486,7 +3959,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"sdot z24.s, z23.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
"sdot z25.s, z23.b, z1.b[3]\n"
@@ -3504,7 +3976,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z23.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"sdot z25.s, z16.b, z1.b[0]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
@@ -3512,7 +3983,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
@@ -3521,53 +3991,60 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
@@ -3656,23 +4133,133 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #2\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "sdot z24.s, z22.b, z0.b[2]\n"
+ "sdot z25.s, z22.b, z1.b[2]\n"
+ "sdot z26.s, z22.b, z2.b[2]\n"
+ "sdot z27.s, z22.b, z3.b[2]\n"
+ "sdot z28.s, z22.b, z4.b[2]\n"
+ "sdot z29.s, z22.b, z5.b[2]\n"
+ "sdot z30.s, z22.b, z6.b[2]\n"
+ "sdot z31.s, z22.b, z7.b[2]\n"
+ "sdot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
+ "sdot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
+ "sdot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
+ "sdot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
+ "sdot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
+ "sdot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
+ "sdot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
+ "sdot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -3751,52 +4338,54 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
@@ -3893,82 +4482,80 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z28.s, z18.b, z4.b[2]\n"
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
@@ -3995,12 +4582,10 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z25.s, z20.b, z1.b[0]\n"
"sdot z26.s, z20.b, z2.b[0]\n"
"sdot z27.s, z20.b, z3.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"sdot z28.s, z20.b, z4.b[0]\n"
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"sdot z24.s, z21.b, z0.b[1]\n"
"sdot z25.s, z21.b, z1.b[1]\n"
"sdot z26.s, z21.b, z2.b[1]\n"
@@ -4009,7 +4594,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"sdot z24.s, z22.b, z0.b[2]\n"
"sdot z25.s, z22.b, z1.b[2]\n"
"sdot z26.s, z22.b, z2.b[2]\n"
@@ -4018,7 +4602,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"sdot z24.s, z23.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
"sdot z25.s, z23.b, z1.b[3]\n"
@@ -4036,7 +4619,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z23.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"sdot z25.s, z16.b, z1.b[0]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
@@ -4044,7 +4626,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
@@ -4053,7 +4634,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
@@ -4062,53 +4642,60 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
@@ -4206,23 +4793,142 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #3\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "sdot z24.s, z22.b, z0.b[2]\n"
+ "sdot z25.s, z22.b, z1.b[2]\n"
+ "sdot z26.s, z22.b, z2.b[2]\n"
+ "sdot z27.s, z22.b, z3.b[2]\n"
+ "sdot z28.s, z22.b, z4.b[2]\n"
+ "sdot z29.s, z22.b, z5.b[2]\n"
+ "sdot z30.s, z22.b, z6.b[2]\n"
+ "sdot z31.s, z22.b, z7.b[2]\n"
+ "sdot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
+ "sdot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
+ "sdot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
+ "sdot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
+ "sdot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
+ "sdot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
+ "sdot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
+ "sdot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -4301,52 +5007,54 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
@@ -4452,82 +5160,80 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z19.b, z5.b[3]\n"
"sdot z30.s, z19.b, z6.b[3]\n"
"sdot z31.s, z19.b, z7.b[3]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z28.s, z18.b, z4.b[2]\n"
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
@@ -4559,7 +5265,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"sdot z24.s, z21.b, z0.b[1]\n"
"sdot z25.s, z21.b, z1.b[1]\n"
"sdot z26.s, z21.b, z2.b[1]\n"
@@ -4568,7 +5273,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"sdot z24.s, z22.b, z0.b[2]\n"
"sdot z25.s, z22.b, z1.b[2]\n"
"sdot z26.s, z22.b, z2.b[2]\n"
@@ -4577,7 +5281,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"sdot z24.s, z23.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
"sdot z25.s, z23.b, z1.b[3]\n"
@@ -4595,7 +5298,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z23.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"sdot z25.s, z16.b, z1.b[0]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
@@ -4603,7 +5305,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
@@ -4612,7 +5313,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
@@ -4621,7 +5321,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z19.b, z0.b[3]\n"
"sdot z25.s, z19.b, z1.b[3]\n"
"sdot z26.s, z19.b, z2.b[3]\n"
@@ -4630,53 +5329,60 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z19.b, z5.b[3]\n"
"sdot z30.s, z19.b, z6.b[3]\n"
"sdot z31.s, z19.b, z7.b[3]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
@@ -4783,23 +5489,151 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z19.b, z5.b[3]\n"
"sdot z30.s, z19.b, z6.b[3]\n"
"sdot z31.s, z19.b, z7.b[3]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "sdot z24.s, z22.b, z0.b[2]\n"
+ "sdot z25.s, z22.b, z1.b[2]\n"
+ "sdot z26.s, z22.b, z2.b[2]\n"
+ "sdot z27.s, z22.b, z3.b[2]\n"
+ "sdot z28.s, z22.b, z4.b[2]\n"
+ "sdot z29.s, z22.b, z5.b[2]\n"
+ "sdot z30.s, z22.b, z6.b[2]\n"
+ "sdot z31.s, z22.b, z7.b[2]\n"
+ "sdot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
+ "sdot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
+ "sdot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
+ "sdot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
+ "sdot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
+ "sdot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
+ "sdot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
+ "sdot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -4878,52 +5712,54 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
@@ -5046,82 +5882,80 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z28.s, z18.b, z4.b[2]\n"
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
@@ -5162,7 +5996,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"sdot z24.s, z22.b, z0.b[2]\n"
"sdot z25.s, z22.b, z1.b[2]\n"
"sdot z26.s, z22.b, z2.b[2]\n"
@@ -5171,7 +6004,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"sdot z24.s, z23.b, z0.b[3]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
"sdot z25.s, z23.b, z1.b[3]\n"
@@ -5189,7 +6021,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z23.b, z7.b[3]\n"
"ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"sdot z25.s, z16.b, z1.b[0]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
@@ -5197,7 +6028,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
@@ -5206,7 +6036,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
@@ -5215,7 +6044,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
"sdot z25.s, z19.b, z1.b[3]\n"
@@ -5233,7 +6061,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
"sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"sdot z25.s, z20.b, z1.b[0]\n"
"sdot z26.s, z20.b, z2.b[0]\n"
"sdot z27.s, z20.b, z3.b[0]\n"
@@ -5241,53 +6068,60 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
@@ -5411,23 +6245,168 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #5\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "sdot z24.s, z22.b, z0.b[2]\n"
+ "sdot z25.s, z22.b, z1.b[2]\n"
+ "sdot z26.s, z22.b, z2.b[2]\n"
+ "sdot z27.s, z22.b, z3.b[2]\n"
+ "sdot z28.s, z22.b, z4.b[2]\n"
+ "sdot z29.s, z22.b, z5.b[2]\n"
+ "sdot z30.s, z22.b, z6.b[2]\n"
+ "sdot z31.s, z22.b, z7.b[2]\n"
+ "sdot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
+ "sdot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
+ "sdot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
+ "sdot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
+ "sdot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
+ "sdot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
+ "sdot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
+ "sdot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -5506,52 +6485,54 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
@@ -5683,82 +6664,80 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z28.s, z18.b, z4.b[2]\n"
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
@@ -5808,7 +6787,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"sdot z24.s, z23.b, z0.b[3]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
"sdot z25.s, z23.b, z1.b[3]\n"
@@ -5826,7 +6804,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z23.b, z7.b[3]\n"
"ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"sdot z25.s, z16.b, z1.b[0]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
@@ -5834,7 +6811,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
@@ -5843,7 +6819,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
@@ -5852,7 +6827,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
"sdot z25.s, z19.b, z1.b[3]\n"
@@ -5870,7 +6844,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
"sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"sdot z25.s, z20.b, z1.b[0]\n"
"sdot z26.s, z20.b, z2.b[0]\n"
"sdot z27.s, z20.b, z3.b[0]\n"
@@ -5878,7 +6851,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"sdot z24.s, z21.b, z0.b[1]\n"
"sdot z25.s, z21.b, z1.b[1]\n"
"sdot z26.s, z21.b, z2.b[1]\n"
@@ -5887,53 +6859,60 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
@@ -6066,23 +7045,177 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z24.s, z22.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #6\n"
+ "sdot z25.s, z22.b, z1.b[2]\n"
+ "sdot z26.s, z22.b, z2.b[2]\n"
+ "sdot z27.s, z22.b, z3.b[2]\n"
+ "sdot z28.s, z22.b, z4.b[2]\n"
+ "sdot z29.s, z22.b, z5.b[2]\n"
+ "sdot z30.s, z22.b, z6.b[2]\n"
+ "sdot z31.s, z22.b, z7.b[2]\n"
+ "sdot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
+ "sdot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
+ "sdot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
+ "sdot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
+ "sdot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
+ "sdot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
+ "sdot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
+ "sdot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -6161,52 +7294,54 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
@@ -6347,82 +7482,80 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"sdot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z28.s, z18.b, z4.b[2]\n"
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
@@ -6493,12 +7626,10 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z25.s, z16.b, z1.b[0]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
@@ -6507,7 +7638,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
@@ -6516,7 +7646,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
"sdot z25.s, z19.b, z1.b[3]\n"
@@ -6534,7 +7663,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
"sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"sdot z25.s, z20.b, z1.b[0]\n"
"sdot z26.s, z20.b, z2.b[0]\n"
"sdot z27.s, z20.b, z3.b[0]\n"
@@ -6542,7 +7670,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"sdot z24.s, z21.b, z0.b[1]\n"
"sdot z25.s, z21.b, z1.b[1]\n"
"sdot z26.s, z21.b, z2.b[1]\n"
@@ -6551,7 +7678,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"sdot z24.s, z22.b, z0.b[2]\n"
"sdot z25.s, z22.b, z1.b[2]\n"
"sdot z26.s, z22.b, z2.b[2]\n"
@@ -6560,53 +7686,60 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
@@ -6748,23 +7881,186 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z24.s, z22.b, z0.b[2]\n"
+ "sdot z25.s, z22.b, z1.b[2]\n"
+ "sdot z26.s, z22.b, z2.b[2]\n"
+ "sdot z27.s, z22.b, z3.b[2]\n"
+ "sdot z28.s, z22.b, z4.b[2]\n"
+ "sdot z29.s, z22.b, z5.b[2]\n"
+ "sdot z30.s, z22.b, z6.b[2]\n"
+ "sdot z31.s, z22.b, z7.b[2]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
+ "sdot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
+ "sdot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
+ "sdot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
+ "sdot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
+ "sdot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
+ "sdot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
+ "sdot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #7\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "sdot z24.s, z22.b, z0.b[2]\n"
+ "sdot z25.s, z22.b, z1.b[2]\n"
+ "sdot z26.s, z22.b, z2.b[2]\n"
+ "sdot z27.s, z22.b, z3.b[2]\n"
+ "sdot z28.s, z22.b, z4.b[2]\n"
+ "sdot z29.s, z22.b, z5.b[2]\n"
+ "sdot z30.s, z22.b, z6.b[2]\n"
+ "sdot z31.s, z22.b, z7.b[2]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -6844,52 +8140,54 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
@@ -7039,83 +8337,81 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z23.b, z5.b[3]\n"
"sdot z30.s, z23.b, z6.b[3]\n"
"sdot z31.s, z23.b, z7.b[3]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z25.s, p7, [c_ptr1]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "sdot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "sdot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "sdot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "sdot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z17.b, z3.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"sdot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"sdot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
@@ -7190,7 +8486,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
"sdot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z24.s, z17.b, z0.b[1]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
@@ -7199,7 +8494,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z17.b, z5.b[1]\n"
"sdot z30.s, z17.b, z6.b[1]\n"
"sdot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"sdot z24.s, z18.b, z0.b[2]\n"
"sdot z25.s, z18.b, z1.b[2]\n"
"sdot z26.s, z18.b, z2.b[2]\n"
@@ -7208,7 +8502,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z18.b, z5.b[2]\n"
"sdot z30.s, z18.b, z6.b[2]\n"
"sdot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"sdot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
"sdot z25.s, z19.b, z1.b[3]\n"
@@ -7226,7 +8519,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
"sdot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"sdot z25.s, z20.b, z1.b[0]\n"
"sdot z26.s, z20.b, z2.b[0]\n"
"sdot z27.s, z20.b, z3.b[0]\n"
@@ -7234,7 +8526,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z20.b, z5.b[0]\n"
"sdot z30.s, z20.b, z6.b[0]\n"
"sdot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"sdot z24.s, z21.b, z0.b[1]\n"
"sdot z25.s, z21.b, z1.b[1]\n"
"sdot z26.s, z21.b, z2.b[1]\n"
@@ -7243,7 +8534,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z21.b, z5.b[1]\n"
"sdot z30.s, z21.b, z6.b[1]\n"
"sdot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"sdot z24.s, z22.b, z0.b[2]\n"
"sdot z25.s, z22.b, z1.b[2]\n"
"sdot z26.s, z22.b, z2.b[2]\n"
@@ -7252,7 +8542,6 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z22.b, z5.b[2]\n"
"sdot z30.s, z22.b, z6.b[2]\n"
"sdot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"sdot z24.s, z23.b, z0.b[3]\n"
"sdot z25.s, z23.b, z1.b[3]\n"
"sdot z26.s, z23.b, z2.b[3]\n"
@@ -7266,49 +8555,235 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"sdot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
"sdot z26.s, z16.b, z2.b[0]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "sdot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"sdot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"sdot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "sdot z24.s, z22.b, z0.b[2]\n"
+ "sdot z25.s, z22.b, z1.b[2]\n"
+ "sdot z26.s, z22.b, z2.b[2]\n"
+ "sdot z27.s, z22.b, z3.b[2]\n"
+ "sdot z28.s, z22.b, z4.b[2]\n"
+ "sdot z29.s, z22.b, z5.b[2]\n"
+ "sdot z30.s, z22.b, z6.b[2]\n"
+ "sdot z31.s, z22.b, z7.b[2]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "sdot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
+ "sdot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
+ "sdot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
+ "sdot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
+ "sdot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
+ "sdot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
+ "sdot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
+ "sdot z31.s, z23.b, z7.b[3]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
"sdot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"sdot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
+ "sdot z30.s, z16.b, z6.b[0]\n"
+ "sdot z31.s, z16.b, z7.b[0]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
+ "sdot z26.s, z17.b, z2.b[1]\n"
+ "sdot z27.s, z17.b, z3.b[1]\n"
+ "sdot z28.s, z17.b, z4.b[1]\n"
+ "sdot z29.s, z17.b, z5.b[1]\n"
+ "sdot z30.s, z17.b, z6.b[1]\n"
+ "sdot z31.s, z17.b, z7.b[1]\n"
+ "sdot z24.s, z18.b, z0.b[2]\n"
+ "sdot z25.s, z18.b, z1.b[2]\n"
+ "sdot z26.s, z18.b, z2.b[2]\n"
+ "sdot z27.s, z18.b, z3.b[2]\n"
+ "sdot z28.s, z18.b, z4.b[2]\n"
+ "sdot z29.s, z18.b, z5.b[2]\n"
+ "sdot z30.s, z18.b, z6.b[2]\n"
+ "sdot z31.s, z18.b, z7.b[2]\n"
+ "sdot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
+ "sdot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
+ "sdot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
+ "sdot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
+ "sdot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
+ "sdot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
+ "sdot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
+ "sdot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
+ "sdot z24.s, z20.b, z0.b[0]\n"
+ "sdot z25.s, z20.b, z1.b[0]\n"
+ "sdot z26.s, z20.b, z2.b[0]\n"
+ "sdot z27.s, z20.b, z3.b[0]\n"
+ "sdot z28.s, z20.b, z4.b[0]\n"
+ "sdot z29.s, z20.b, z5.b[0]\n"
+ "sdot z30.s, z20.b, z6.b[0]\n"
+ "sdot z31.s, z20.b, z7.b[0]\n"
+ "sdot z24.s, z21.b, z0.b[1]\n"
+ "sdot z25.s, z21.b, z1.b[1]\n"
+ "sdot z26.s, z21.b, z2.b[1]\n"
+ "sdot z27.s, z21.b, z3.b[1]\n"
+ "sdot z28.s, z21.b, z4.b[1]\n"
+ "sdot z29.s, z21.b, z5.b[1]\n"
+ "sdot z30.s, z21.b, z6.b[1]\n"
+ "sdot z31.s, z21.b, z7.b[1]\n"
+ "sdot z24.s, z22.b, z0.b[2]\n"
+ "sdot z25.s, z22.b, z1.b[2]\n"
+ "sdot z26.s, z22.b, z2.b[2]\n"
+ "sdot z27.s, z22.b, z3.b[2]\n"
+ "sdot z28.s, z22.b, z4.b[2]\n"
+ "sdot z29.s, z22.b, z5.b[2]\n"
+ "sdot z30.s, z22.b, z6.b[2]\n"
+ "sdot z31.s, z22.b, z7.b[2]\n"
+ "sdot z24.s, z23.b, z0.b[3]\n"
+ "sdot z25.s, z23.b, z1.b[3]\n"
+ "sdot z26.s, z23.b, z2.b[3]\n"
+ "sdot z27.s, z23.b, z3.b[3]\n"
+ "sdot z28.s, z23.b, z4.b[3]\n"
+ "sdot z29.s, z23.b, z5.b[3]\n"
+ "sdot z30.s, z23.b, z6.b[3]\n"
+ "sdot z31.s, z23.b, z7.b[3]\n"
+ "b 5f\n"
+ "2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "sdot z24.s, z16.b, z0.b[0]\n"
+ "sdot z25.s, z16.b, z1.b[0]\n"
+ "sdot z26.s, z16.b, z2.b[0]\n"
+ "sdot z27.s, z16.b, z3.b[0]\n"
+ "sdot z28.s, z16.b, z4.b[0]\n"
+ "sdot z29.s, z16.b, z5.b[0]\n"
"sdot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"sdot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "sdot z24.s, z17.b, z0.b[1]\n"
+ "sdot z25.s, z17.b, z1.b[1]\n"
"sdot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
"sdot z27.s, z17.b, z3.b[1]\n"
"sdot z28.s, z17.b, z4.b[1]\n"
"sdot z29.s, z17.b, z5.b[1]\n"
@@ -7458,23 +8933,16 @@ void sve_smallK_hybrid_s8s32_dot_1VLx8(const int8_t *A, int lda, const int8_t *B
"sdot z29.s, z23.b, z5.b[3]\n"
"sdot z30.s, z23.b, z6.b[3]\n"
"sdot z31.s, z23.b, z7.b[3]\n"
- "2:\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp
index 70a0b12130..60184be043 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8.hpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL.hpp
@@ -31,9 +31,9 @@ namespace arm_gemm
{
// Actual kernel implementations
-void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
+void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *, int, const uint8_t *, uint32_t *, int, int, int, int, const uint32_t *, Activation, bool);
-class smallK_hybrid_u8u32_dot_1VLx8
+class cls_sve_smallK_hybrid_u8u32_dot_8x1VL
{
public:
typedef uint8_t operand_type;
@@ -75,9 +75,9 @@ public:
StdTransformsSVE<operand_type, result_type, 8, 1, 4> transforms = {};
// Default to the generic kernel
- kern_type kernel=sve_smallK_hybrid_u8u32_dot_1VLx8;
+ kern_type kernel=sve_smallK_hybrid_u8u32_dot_8x1VL;
- smallK_hybrid_u8u32_dot_1VLx8(const CPUInfo *)
+ cls_sve_smallK_hybrid_u8u32_dot_8x1VL(const CPUInfo *)
{
}
diff --git a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp
index 1d0b84e788..b980d9b5c2 100644
--- a/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_1VLx8/generic.cpp
+++ b/src/core/NEON/kernels/arm_gemm/kernels/sve_smallK_hybrid_u8u32_dot_8x1VL/generic.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -33,7 +33,7 @@
namespace arm_gemm {
-void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *, Activation, bool) {
+void sve_smallK_hybrid_u8u32_dot_8x1VL(const uint8_t *A, int lda, const uint8_t *B, uint32_t *C, int ldc, int M, int N, int K, const uint32_t *bias, Activation act, bool) {
const long loops_count = iceildiv(N, (int)get_vector_length<uint32_t>()) - 1;
const long ldab = lda * sizeof(uint8_t);
const long ldcb = ldc * sizeof(uint32_t);
@@ -112,55 +112,54 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7]\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
"mov z27.s, #0\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"mov z28.s, #0\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
"mov z29.s, #0\n"
- "ld1rqb z1.b, p6/z, [a_ptr1]\n"
"mov z30.s, #0\n"
- "ld1rqb z2.b, p6/z, [a_ptr2]\n"
"mov z31.s, #0\n"
- "ld1rqb z3.b, p6/z, [a_ptr3]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4]\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5]\n"
"udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
- "cbz %[loops], 2f\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
"udot z24.s, z16.b, z0.b[0]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"mov z26.s, #0\n"
@@ -186,10 +185,9 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"mov z31.s, #0\n"
"addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
@@ -201,6 +199,8 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
@@ -230,23 +230,34 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -325,112 +336,112 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #2\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
"mov z28.s, #0\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
"mov z29.s, #0\n"
- "ld1rqb z1.b, p6/z, [a_ptr1]\n"
"mov z30.s, #0\n"
- "ld1rqb z2.b, p6/z, [a_ptr2]\n"
"mov z31.s, #0\n"
- "ld1rqb z3.b, p6/z, [a_ptr3]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4]\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5]\n"
"udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"udot z24.s, z17.b, z0.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
"udot z28.s, z17.b, z4.b[1]\n"
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #2\n"
- "st1w z25.s, p7, [c_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
+ "addvl %[b_ptr0], %[b_ptr0], #2\n"
"udot z24.s, z16.b, z0.b[0]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"mov z26.s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"mov z27.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"udot z26.s, z16.b, z2.b[0]\n"
"st1w z28.s, p7, [c_ptr4]\n"
"mov z28.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"udot z27.s, z16.b, z3.b[0]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"udot z28.s, z16.b, z4.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z29.s, z16.b, z5.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
- "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
"prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
"prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
- "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
"prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
- "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"addvl %[b_ptr0], %[b_ptr0], #2\n"
"st1w z25.s, p7, [c_ptr1]\n"
@@ -470,23 +481,42 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -565,48 +595,50 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #3\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
"mov z28.s, #0\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
"mov z29.s, #0\n"
- "ld1rqb z1.b, p6/z, [a_ptr1]\n"
"mov z30.s, #0\n"
- "ld1rqb z2.b, p6/z, [a_ptr2]\n"
"mov z31.s, #0\n"
- "ld1rqb z3.b, p6/z, [a_ptr3]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4]\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5]\n"
"udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"udot z24.s, z17.b, z0.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
@@ -618,49 +650,46 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
- "st1w z25.s, p7, [c_ptr1]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z16.b, z0.b[0]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"mov z26.s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"mov z27.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"udot z26.s, z16.b, z2.b[0]\n"
"st1w z28.s, p7, [c_ptr4]\n"
"mov z28.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"udot z27.s, z16.b, z3.b[0]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"udot z28.s, z16.b, z4.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z29.s, z16.b, z5.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #3\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"udot z24.s, z17.b, z0.b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"udot z25.s, z17.b, z1.b[1]\n"
@@ -676,7 +705,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z30.s, z17.b, z6.b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
"udot z25.s, z18.b, z1.b[2]\n"
"udot z26.s, z18.b, z2.b[2]\n"
@@ -690,11 +718,12 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #3\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z16.b, z0.b[0]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"mov z26.s, #0\n"
@@ -720,8 +749,9 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"mov z31.s, #0\n"
"addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #3\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
@@ -737,23 +767,50 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -832,50 +889,52 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
"mov z28.s, #0\n"
- "ld1rqb z0.b, p6/z, [%[a_ptr0]]\n"
"mov z29.s, #0\n"
- "ld1rqb z1.b, p6/z, [a_ptr1]\n"
"mov z30.s, #0\n"
- "ld1rqb z2.b, p6/z, [a_ptr2]\n"
"mov z31.s, #0\n"
- "ld1rqb z3.b, p6/z, [a_ptr3]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "ld1rqb z4.b, p6/z, [a_ptr4]\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "ld1rqb z5.b, p6/z, [a_ptr5]\n"
"udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p6/z, [a_ptr6]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p6/z, [a_ptr7]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"udot z24.s, z17.b, z0.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
"udot z24.s, z18.b, z0.b[2]\n"
@@ -894,50 +953,47 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z19.b, z5.b[3]\n"
"udot z30.s, z19.b, z6.b[3]\n"
"udot z31.s, z19.b, z7.b[3]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
- "st1w z25.s, p7, [c_ptr1]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z16.b, z0.b[0]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"mov z26.s, #0\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
"st1w z27.s, p7, [c_ptr3]\n"
"mov z27.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"udot z26.s, z16.b, z2.b[0]\n"
"st1w z28.s, p7, [c_ptr4]\n"
"mov z28.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"udot z27.s, z16.b, z3.b[0]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"udot z28.s, z16.b, z4.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z29.s, z16.b, z5.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr6, c_ptr6, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"udot z24.s, z17.b, z0.b[1]\n"
"prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"udot z25.s, z17.b, z1.b[1]\n"
@@ -953,7 +1009,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z30.s, z17.b, z6.b[1]\n"
"prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
"udot z25.s, z18.b, z1.b[2]\n"
"udot z26.s, z18.b, z2.b[2]\n"
@@ -962,7 +1017,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z19.b, z0.b[3]\n"
"udot z25.s, z19.b, z1.b[3]\n"
"udot z26.s, z19.b, z2.b[3]\n"
@@ -976,14 +1030,16 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z16.b, z0.b[0]\n"
"st1w z26.s, p7, [c_ptr2]\n"
"mov z26.s, #0\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"addvl c_ptr2, c_ptr2, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
"st1w z27.s, p7, [c_ptr3]\n"
@@ -1006,8 +1062,9 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"mov z31.s, #0\n"
"addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
@@ -1031,23 +1088,58 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z19.b, z5.b[3]\n"
"udot z30.s, z19.b, z6.b[3]\n"
"udot z31.s, z19.b, z7.b[3]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "mov z25.s, #0\n"
+ "mov z26.s, #0\n"
+ "mov z27.s, #0\n"
+ "mov z28.s, #0\n"
+ "mov z29.s, #0\n"
+ "mov z30.s, #0\n"
+ "mov z31.s, #0\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1126,46 +1218,48 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #5\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z24.s, z17.b, z0.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
@@ -1205,84 +1299,79 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #5\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "mov z28.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "mov z29.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
"udot z29.s, z16.b, z5.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
"addvl c_ptr5, c_ptr5, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
"udot z25.s, z19.b, z1.b[3]\n"
@@ -1300,7 +1389,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
"udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"udot z25.s, z20.b, z1.b[0]\n"
"udot z26.s, z20.b, z2.b[0]\n"
"udot z27.s, z20.b, z3.b[0]\n"
@@ -1313,39 +1401,41 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #5\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #5\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "mov z28.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "mov z29.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
"udot z29.s, z16.b, z5.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
@@ -1354,6 +1444,8 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"addvl c_ptr6, c_ptr6, #1\n"
"udot z31.s, z16.b, z7.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
@@ -1392,23 +1484,82 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1487,48 +1638,50 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #6\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
@@ -1575,85 +1728,80 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
- "st1w z25.s, p7, [c_ptr1]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #6\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"mov z29.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr5, c_ptr5, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
"udot z25.s, z19.b, z1.b[3]\n"
@@ -1671,7 +1819,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
"udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"udot z25.s, z20.b, z1.b[0]\n"
"udot z26.s, z20.b, z2.b[0]\n"
"udot z27.s, z20.b, z3.b[0]\n"
@@ -1679,7 +1826,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"udot z24.s, z21.b, z0.b[1]\n"
"udot z25.s, z21.b, z1.b[1]\n"
"udot z26.s, z21.b, z2.b[1]\n"
@@ -1693,47 +1839,52 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #6\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #6\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
"st1w z29.s, p7, [c_ptr5]\n"
"mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr5, c_ptr5, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
"addvl c_ptr6, c_ptr6, #1\n"
"udot z31.s, z16.b, z7.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
@@ -1780,23 +1931,90 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -1875,48 +2093,50 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #7\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
@@ -1972,86 +2192,81 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "mov z24.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #7\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
"udot z25.s, z19.b, z1.b[3]\n"
@@ -2069,7 +2284,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
"udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"udot z25.s, z20.b, z1.b[0]\n"
"udot z26.s, z20.b, z2.b[0]\n"
"udot z27.s, z20.b, z3.b[0]\n"
@@ -2077,7 +2291,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"udot z24.s, z21.b, z0.b[1]\n"
"udot z25.s, z21.b, z1.b[1]\n"
"udot z26.s, z21.b, z2.b[1]\n"
@@ -2086,7 +2299,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"udot z24.s, z22.b, z0.b[2]\n"
"udot z25.s, z22.b, z1.b[2]\n"
"udot z26.s, z22.b, z2.b[2]\n"
@@ -2100,47 +2312,135 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #7\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #7\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z27.s, z16.b, z3.b[0]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "udot z24.s, z22.b, z0.b[2]\n"
+ "udot z25.s, z22.b, z1.b[2]\n"
+ "udot z26.s, z22.b, z2.b[2]\n"
+ "udot z27.s, z22.b, z3.b[2]\n"
+ "udot z28.s, z22.b, z4.b[2]\n"
+ "udot z29.s, z22.b, z5.b[2]\n"
+ "udot z30.s, z22.b, z6.b[2]\n"
+ "udot z31.s, z22.b, z7.b[2]\n"
+ "b 5f\n"
+ "2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
@@ -2195,23 +2495,16 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
- "2:\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2290,49 +2583,51 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
@@ -2396,87 +2691,82 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z23.b, z5.b[3]\n"
"udot z30.s, z23.b, z6.b[3]\n"
"udot z31.s, z23.b, z7.b[3]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z25.s, p7, [c_ptr1]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
"udot z25.s, z19.b, z1.b[3]\n"
@@ -2494,7 +2784,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
"udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"udot z25.s, z20.b, z1.b[0]\n"
"udot z26.s, z20.b, z2.b[0]\n"
"udot z27.s, z20.b, z3.b[0]\n"
@@ -2502,7 +2791,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"udot z24.s, z21.b, z0.b[1]\n"
"udot z25.s, z21.b, z1.b[1]\n"
"udot z26.s, z21.b, z2.b[1]\n"
@@ -2511,7 +2799,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"udot z24.s, z22.b, z0.b[2]\n"
"udot z25.s, z22.b, z1.b[2]\n"
"udot z26.s, z22.b, z2.b[2]\n"
@@ -2520,7 +2807,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"udot z24.s, z23.b, z0.b[3]\n"
"udot z25.s, z23.b, z1.b[3]\n"
"udot z26.s, z23.b, z2.b[3]\n"
@@ -2534,47 +2820,144 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x10]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x10]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x10]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x10]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x10]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x10]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x10]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x10]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "udot z24.s, z22.b, z0.b[2]\n"
+ "udot z25.s, z22.b, z1.b[2]\n"
+ "udot z26.s, z22.b, z2.b[2]\n"
+ "udot z27.s, z22.b, z3.b[2]\n"
+ "udot z28.s, z22.b, z4.b[2]\n"
+ "udot z29.s, z22.b, z5.b[2]\n"
+ "udot z30.s, z22.b, z6.b[2]\n"
+ "udot z31.s, z22.b, z7.b[2]\n"
+ "udot z24.s, z23.b, z0.b[3]\n"
+ "udot z25.s, z23.b, z1.b[3]\n"
+ "udot z26.s, z23.b, z2.b[3]\n"
+ "udot z27.s, z23.b, z3.b[3]\n"
+ "udot z28.s, z23.b, z4.b[3]\n"
+ "udot z29.s, z23.b, z5.b[3]\n"
+ "udot z30.s, z23.b, z6.b[3]\n"
+ "udot z31.s, z23.b, z7.b[3]\n"
+ "b 5f\n"
+ "2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
@@ -2637,23 +3020,16 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z23.b, z5.b[3]\n"
"udot z30.s, z23.b, z6.b[3]\n"
"udot z31.s, z23.b, z7.b[3]\n"
- "2:\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -2732,54 +3108,56 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
"udot z28.s, z17.b, z4.b[1]\n"
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
@@ -2856,88 +3234,84 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "addvl %[b_ptr0], %[b_ptr0], #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
"udot z25.s, z19.b, z1.b[3]\n"
@@ -2955,7 +3329,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
"udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"udot z25.s, z20.b, z1.b[0]\n"
"udot z26.s, z20.b, z2.b[0]\n"
"udot z27.s, z20.b, z3.b[0]\n"
@@ -2963,7 +3336,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"udot z24.s, z21.b, z0.b[1]\n"
"udot z25.s, z21.b, z1.b[1]\n"
"udot z26.s, z21.b, z2.b[1]\n"
@@ -2972,7 +3344,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"udot z24.s, z22.b, z0.b[2]\n"
"udot z25.s, z22.b, z1.b[2]\n"
"udot z26.s, z22.b, z2.b[2]\n"
@@ -2981,7 +3352,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"udot z24.s, z23.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
"udot z25.s, z23.b, z1.b[3]\n"
@@ -2999,7 +3369,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z23.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"udot z25.s, z16.b, z1.b[0]\n"
"udot z26.s, z16.b, z2.b[0]\n"
"udot z27.s, z16.b, z3.b[0]\n"
@@ -3007,55 +3376,62 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"addvl %[b_ptr0], %[b_ptr0], #1\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
"udot z29.s, z17.b, z5.b[1]\n"
@@ -3133,23 +3509,124 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #1\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "udot z24.s, z22.b, z0.b[2]\n"
+ "udot z25.s, z22.b, z1.b[2]\n"
+ "udot z26.s, z22.b, z2.b[2]\n"
+ "udot z27.s, z22.b, z3.b[2]\n"
+ "udot z28.s, z22.b, z4.b[2]\n"
+ "udot z29.s, z22.b, z5.b[2]\n"
+ "udot z30.s, z22.b, z6.b[2]\n"
+ "udot z31.s, z22.b, z7.b[2]\n"
+ "udot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
+ "udot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
+ "udot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
+ "udot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
+ "udot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
+ "udot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
+ "udot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
+ "udot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -3228,52 +3705,54 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
"udot z29.s, z17.b, z5.b[1]\n"
@@ -3361,88 +3840,85 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
- "udot z25.s, z18.b, z1.b[2]\n"
"addvl %[b_ptr0], %[b_ptr0], #2\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
"udot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
"udot z25.s, z19.b, z1.b[3]\n"
@@ -3460,7 +3936,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
"udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"udot z25.s, z20.b, z1.b[0]\n"
"udot z26.s, z20.b, z2.b[0]\n"
"udot z27.s, z20.b, z3.b[0]\n"
@@ -3468,7 +3943,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"udot z24.s, z21.b, z0.b[1]\n"
"udot z25.s, z21.b, z1.b[1]\n"
"udot z26.s, z21.b, z2.b[1]\n"
@@ -3477,7 +3951,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"udot z24.s, z22.b, z0.b[2]\n"
"udot z25.s, z22.b, z1.b[2]\n"
"udot z26.s, z22.b, z2.b[2]\n"
@@ -3486,7 +3959,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"udot z24.s, z23.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
"udot z25.s, z23.b, z1.b[3]\n"
@@ -3504,7 +3976,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z23.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"udot z25.s, z16.b, z1.b[0]\n"
"udot z26.s, z16.b, z2.b[0]\n"
"udot z27.s, z16.b, z3.b[0]\n"
@@ -3512,7 +3983,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
@@ -3521,53 +3991,60 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
@@ -3656,23 +4133,133 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #2\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "udot z24.s, z22.b, z0.b[2]\n"
+ "udot z25.s, z22.b, z1.b[2]\n"
+ "udot z26.s, z22.b, z2.b[2]\n"
+ "udot z27.s, z22.b, z3.b[2]\n"
+ "udot z28.s, z22.b, z4.b[2]\n"
+ "udot z29.s, z22.b, z5.b[2]\n"
+ "udot z30.s, z22.b, z6.b[2]\n"
+ "udot z31.s, z22.b, z7.b[2]\n"
+ "udot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
+ "udot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
+ "udot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
+ "udot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
+ "udot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
+ "udot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
+ "udot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
+ "udot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -3751,52 +4338,54 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
"udot z29.s, z17.b, z5.b[1]\n"
@@ -3893,82 +4482,80 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z28.s, z18.b, z4.b[2]\n"
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
@@ -3995,12 +4582,10 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z25.s, z20.b, z1.b[0]\n"
"udot z26.s, z20.b, z2.b[0]\n"
"udot z27.s, z20.b, z3.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"udot z28.s, z20.b, z4.b[0]\n"
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"udot z24.s, z21.b, z0.b[1]\n"
"udot z25.s, z21.b, z1.b[1]\n"
"udot z26.s, z21.b, z2.b[1]\n"
@@ -4009,7 +4594,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"udot z24.s, z22.b, z0.b[2]\n"
"udot z25.s, z22.b, z1.b[2]\n"
"udot z26.s, z22.b, z2.b[2]\n"
@@ -4018,7 +4602,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"udot z24.s, z23.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
"udot z25.s, z23.b, z1.b[3]\n"
@@ -4036,7 +4619,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z23.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"udot z25.s, z16.b, z1.b[0]\n"
"udot z26.s, z16.b, z2.b[0]\n"
"udot z27.s, z16.b, z3.b[0]\n"
@@ -4044,7 +4626,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
@@ -4053,7 +4634,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
"udot z25.s, z18.b, z1.b[2]\n"
"udot z26.s, z18.b, z2.b[2]\n"
@@ -4062,53 +4642,60 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
@@ -4206,23 +4793,142 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #3\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "udot z24.s, z22.b, z0.b[2]\n"
+ "udot z25.s, z22.b, z1.b[2]\n"
+ "udot z26.s, z22.b, z2.b[2]\n"
+ "udot z27.s, z22.b, z3.b[2]\n"
+ "udot z28.s, z22.b, z4.b[2]\n"
+ "udot z29.s, z22.b, z5.b[2]\n"
+ "udot z30.s, z22.b, z6.b[2]\n"
+ "udot z31.s, z22.b, z7.b[2]\n"
+ "udot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
+ "udot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
+ "udot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
+ "udot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
+ "udot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
+ "udot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
+ "udot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
+ "udot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -4301,52 +5007,54 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
"udot z29.s, z17.b, z5.b[1]\n"
@@ -4452,82 +5160,80 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z19.b, z5.b[3]\n"
"udot z30.s, z19.b, z6.b[3]\n"
"udot z31.s, z19.b, z7.b[3]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z28.s, z18.b, z4.b[2]\n"
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
@@ -4559,7 +5265,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"udot z24.s, z21.b, z0.b[1]\n"
"udot z25.s, z21.b, z1.b[1]\n"
"udot z26.s, z21.b, z2.b[1]\n"
@@ -4568,7 +5273,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"udot z24.s, z22.b, z0.b[2]\n"
"udot z25.s, z22.b, z1.b[2]\n"
"udot z26.s, z22.b, z2.b[2]\n"
@@ -4577,7 +5281,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"udot z24.s, z23.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
"udot z25.s, z23.b, z1.b[3]\n"
@@ -4595,7 +5298,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z23.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"udot z25.s, z16.b, z1.b[0]\n"
"udot z26.s, z16.b, z2.b[0]\n"
"udot z27.s, z16.b, z3.b[0]\n"
@@ -4603,7 +5305,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
@@ -4612,7 +5313,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
"udot z25.s, z18.b, z1.b[2]\n"
"udot z26.s, z18.b, z2.b[2]\n"
@@ -4621,7 +5321,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z19.b, z0.b[3]\n"
"udot z25.s, z19.b, z1.b[3]\n"
"udot z26.s, z19.b, z2.b[3]\n"
@@ -4630,53 +5329,60 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z19.b, z5.b[3]\n"
"udot z30.s, z19.b, z6.b[3]\n"
"udot z31.s, z19.b, z7.b[3]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
@@ -4783,23 +5489,151 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z19.b, z5.b[3]\n"
"udot z30.s, z19.b, z6.b[3]\n"
"udot z31.s, z19.b, z7.b[3]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #4\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "udot z24.s, z22.b, z0.b[2]\n"
+ "udot z25.s, z22.b, z1.b[2]\n"
+ "udot z26.s, z22.b, z2.b[2]\n"
+ "udot z27.s, z22.b, z3.b[2]\n"
+ "udot z28.s, z22.b, z4.b[2]\n"
+ "udot z29.s, z22.b, z5.b[2]\n"
+ "udot z30.s, z22.b, z6.b[2]\n"
+ "udot z31.s, z22.b, z7.b[2]\n"
+ "udot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x20]\n"
+ "udot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x20]\n"
+ "udot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x20]\n"
+ "udot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x20]\n"
+ "udot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x20]\n"
+ "udot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x20]\n"
+ "udot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x20]\n"
+ "udot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x20]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -4878,52 +5712,54 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
"udot z29.s, z17.b, z5.b[1]\n"
@@ -5046,82 +5882,80 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z28.s, z18.b, z4.b[2]\n"
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
@@ -5162,7 +5996,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"udot z24.s, z22.b, z0.b[2]\n"
"udot z25.s, z22.b, z1.b[2]\n"
"udot z26.s, z22.b, z2.b[2]\n"
@@ -5171,7 +6004,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"udot z24.s, z23.b, z0.b[3]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
"udot z25.s, z23.b, z1.b[3]\n"
@@ -5189,7 +6021,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z23.b, z7.b[3]\n"
"ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"udot z25.s, z16.b, z1.b[0]\n"
"udot z26.s, z16.b, z2.b[0]\n"
"udot z27.s, z16.b, z3.b[0]\n"
@@ -5197,7 +6028,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
@@ -5206,7 +6036,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
"udot z25.s, z18.b, z1.b[2]\n"
"udot z26.s, z18.b, z2.b[2]\n"
@@ -5215,7 +6044,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
"udot z25.s, z19.b, z1.b[3]\n"
@@ -5233,7 +6061,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
"udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"udot z25.s, z20.b, z1.b[0]\n"
"udot z26.s, z20.b, z2.b[0]\n"
"udot z27.s, z20.b, z3.b[0]\n"
@@ -5241,53 +6068,60 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
@@ -5411,23 +6245,168 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #5\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "udot z24.s, z22.b, z0.b[2]\n"
+ "udot z25.s, z22.b, z1.b[2]\n"
+ "udot z26.s, z22.b, z2.b[2]\n"
+ "udot z27.s, z22.b, z3.b[2]\n"
+ "udot z28.s, z22.b, z4.b[2]\n"
+ "udot z29.s, z22.b, z5.b[2]\n"
+ "udot z30.s, z22.b, z6.b[2]\n"
+ "udot z31.s, z22.b, z7.b[2]\n"
+ "udot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
+ "udot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
+ "udot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
+ "udot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
+ "udot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
+ "udot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
+ "udot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
+ "udot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -5506,52 +6485,54 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
"udot z29.s, z17.b, z5.b[1]\n"
@@ -5683,82 +6664,80 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z28.s, z18.b, z4.b[2]\n"
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
@@ -5808,7 +6787,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"udot z24.s, z23.b, z0.b[3]\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
"udot z25.s, z23.b, z1.b[3]\n"
@@ -5826,7 +6804,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z23.b, z7.b[3]\n"
"ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"udot z25.s, z16.b, z1.b[0]\n"
"udot z26.s, z16.b, z2.b[0]\n"
"udot z27.s, z16.b, z3.b[0]\n"
@@ -5834,7 +6811,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
@@ -5843,7 +6819,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
"udot z25.s, z18.b, z1.b[2]\n"
"udot z26.s, z18.b, z2.b[2]\n"
@@ -5852,7 +6827,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
"udot z25.s, z19.b, z1.b[3]\n"
@@ -5870,7 +6844,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
"udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"udot z25.s, z20.b, z1.b[0]\n"
"udot z26.s, z20.b, z2.b[0]\n"
"udot z27.s, z20.b, z3.b[0]\n"
@@ -5878,7 +6851,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"udot z24.s, z21.b, z0.b[1]\n"
"udot z25.s, z21.b, z1.b[1]\n"
"udot z26.s, z21.b, z2.b[1]\n"
@@ -5887,53 +6859,60 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
@@ -6066,23 +7045,177 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z24.s, z22.b, z0.b[2]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #6\n"
+ "udot z25.s, z22.b, z1.b[2]\n"
+ "udot z26.s, z22.b, z2.b[2]\n"
+ "udot z27.s, z22.b, z3.b[2]\n"
+ "udot z28.s, z22.b, z4.b[2]\n"
+ "udot z29.s, z22.b, z5.b[2]\n"
+ "udot z30.s, z22.b, z6.b[2]\n"
+ "udot z31.s, z22.b, z7.b[2]\n"
+ "udot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
+ "udot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
+ "udot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
+ "udot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
+ "udot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
+ "udot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
+ "udot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
+ "udot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -6161,52 +7294,54 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
"udot z29.s, z17.b, z5.b[1]\n"
@@ -6347,82 +7482,80 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "b.eq 3f\n"
+ "4:\n"
+ "st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
+ "mov z25.s, #0\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
+ "mov z26.s, #0\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "b.eq 3f\n"
- "4:\n"
- "st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "subs %[loops], %[loops], #0x1\n"
- "st1w z25.s, p7, [c_ptr1]\n"
- "addvl %[c_ptr0], %[c_ptr0], #1\n"
- "mov z25.s, #0\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
"udot z24.s, z16.b, z0.b[0]\n"
- "st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
- "mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z28.s, z18.b, z4.b[2]\n"
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
@@ -6493,12 +7626,10 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z25.s, z16.b, z1.b[0]\n"
"udot z26.s, z16.b, z2.b[0]\n"
"udot z27.s, z16.b, z3.b[0]\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
"udot z28.s, z16.b, z4.b[0]\n"
"udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
@@ -6507,7 +7638,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
"udot z25.s, z18.b, z1.b[2]\n"
"udot z26.s, z18.b, z2.b[2]\n"
@@ -6516,7 +7646,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
"udot z25.s, z19.b, z1.b[3]\n"
@@ -6534,7 +7663,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
"udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"udot z25.s, z20.b, z1.b[0]\n"
"udot z26.s, z20.b, z2.b[0]\n"
"udot z27.s, z20.b, z3.b[0]\n"
@@ -6542,7 +7670,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"udot z24.s, z21.b, z0.b[1]\n"
"udot z25.s, z21.b, z1.b[1]\n"
"udot z26.s, z21.b, z2.b[1]\n"
@@ -6551,7 +7678,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"udot z24.s, z22.b, z0.b[2]\n"
"udot z25.s, z22.b, z1.b[2]\n"
"udot z26.s, z22.b, z2.b[2]\n"
@@ -6560,53 +7686,60 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"b.ne 4b\n"
"3:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "mov z24.s, #0\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "mov z24.s, #0\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
- "mov z26.s, #0\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
"addvl c_ptr2, c_ptr2, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"addvl c_ptr6, c_ptr6, #1\n"
- "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
"addvl c_ptr7, c_ptr7, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
@@ -6748,23 +7881,186 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
+ "b 5f\n"
"2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "mov z31.s, #0\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z24.s, z22.b, z0.b[2]\n"
+ "udot z25.s, z22.b, z1.b[2]\n"
+ "udot z26.s, z22.b, z2.b[2]\n"
+ "udot z27.s, z22.b, z3.b[2]\n"
+ "udot z28.s, z22.b, z4.b[2]\n"
+ "udot z29.s, z22.b, z5.b[2]\n"
+ "udot z30.s, z22.b, z6.b[2]\n"
+ "udot z31.s, z22.b, z7.b[2]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
+ "udot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
+ "udot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
+ "udot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
+ "udot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
+ "udot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
+ "udot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
+ "udot z31.s, z23.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #7\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "udot z24.s, z22.b, z0.b[2]\n"
+ "udot z25.s, z22.b, z1.b[2]\n"
+ "udot z26.s, z22.b, z2.b[2]\n"
+ "udot z27.s, z22.b, z3.b[2]\n"
+ "udot z28.s, z22.b, z4.b[2]\n"
+ "udot z29.s, z22.b, z5.b[2]\n"
+ "udot z30.s, z22.b, z6.b[2]\n"
+ "udot z31.s, z22.b, z7.b[2]\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
@@ -6844,52 +8140,54 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"add c_ptr1, %[c_ptr0], #0x0\n"
"add a_ptr1, %[a_ptr0], #0x0\n"
"1:\n"
- "mov z24.s, #0\n"
"ptrue p7.b\n"
- "mov z25.s, #0\n"
"whilelt p6.b, %[temp], %[odd_depth]\n"
- "mov z26.s, #0\n"
+ "whilelt p0.s, %[temp], %[last_width]\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "mov z27.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "mov z28.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
"ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "mov z29.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
"ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "mov z30.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "mov z31.s, #0\n"
+ "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
- "whilelt p0.s, %[temp], %[last_width]\n"
"ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x40]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "cbz %[loops], 2f\n"
+ "mov z24.s, #0\n"
"ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x80]\n"
+ "mov z25.s, #0\n"
"ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0xc0]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "mov z26.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x100]\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "mov z28.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
+ "mov z29.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
+ "mov z30.s, #0\n"
"ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "mov z31.s, #0\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "subs %[loops], %[loops], #0x1\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x140]\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "prfm PLDL1KEEP, [a_ptr7, #0x180]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
"udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
"udot z29.s, z17.b, z5.b[1]\n"
@@ -7039,83 +8337,81 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z23.b, z5.b[3]\n"
"udot z30.s, z23.b, z6.b[3]\n"
"udot z31.s, z23.b, z7.b[3]\n"
- "cbz %[loops], 2f\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
- "subs %[loops], %[loops], #0x1\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"b.eq 3f\n"
"4:\n"
"st1w z24.s, p7, [%[c_ptr0]]\n"
"subs %[loops], %[loops], #0x1\n"
"mov z24.s, #0\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
- "st1w z25.s, p7, [c_ptr1]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
+ "st1w z25.s, p7, [c_ptr1]\n"
+ "addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
- "ld1rqb z2.b, p7/z, [a_ptr2]\n"
- "ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
- "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
"mov z27.s, #0\n"
- "ld1rqb z4.b, p7/z, [a_ptr4]\n"
- "udot z26.s, z16.b, z2.b[0]\n"
- "ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
"st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
"mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
- "udot z27.s, z16.b, z3.b[0]\n"
- "ld1rqb z7.b, p7/z, [a_ptr7]\n"
- "udot z25.s, z17.b, z1.b[1]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
"mov z29.s, #0\n"
- "addvl c_ptr2, c_ptr2, #1\n"
- "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
"st1w z30.s, p7, [c_ptr6]\n"
"mov z30.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
- "udot z29.s, z16.b, z5.b[0]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
"st1w z31.s, p7, [c_ptr7]\n"
"mov z31.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
+ "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z17.b, z3.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
+ "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z17.b, z4.b[1]\n"
- "prfm PSTL1KEEP, [%[c_ptr0], #0x40]\n"
+ "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z17.b, z5.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr1, #0x40]\n"
"udot z30.s, z17.b, z6.b[1]\n"
- "prfm PSTL1KEEP, [c_ptr2, #0x40]\n"
"udot z31.s, z17.b, z7.b[1]\n"
"ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr3, #0x40]\n"
"udot z25.s, z18.b, z1.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr4, #0x40]\n"
"udot z26.s, z18.b, z2.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr5, #0x40]\n"
"udot z27.s, z18.b, z3.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr6, #0x40]\n"
"udot z28.s, z18.b, z4.b[2]\n"
- "prfm PSTL1KEEP, [c_ptr7, #0x40]\n"
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
@@ -7190,7 +8486,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
"udot z31.s, z16.b, z7.b[0]\n"
- "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z24.s, z17.b, z0.b[1]\n"
"udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
@@ -7199,7 +8494,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z17.b, z5.b[1]\n"
"udot z30.s, z17.b, z6.b[1]\n"
"udot z31.s, z17.b, z7.b[1]\n"
- "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"udot z24.s, z18.b, z0.b[2]\n"
"udot z25.s, z18.b, z1.b[2]\n"
"udot z26.s, z18.b, z2.b[2]\n"
@@ -7208,7 +8502,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z18.b, z5.b[2]\n"
"udot z30.s, z18.b, z6.b[2]\n"
"udot z31.s, z18.b, z7.b[2]\n"
- "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
"udot z24.s, z19.b, z0.b[3]\n"
"ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
"udot z25.s, z19.b, z1.b[3]\n"
@@ -7226,7 +8519,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z31.s, z19.b, z7.b[3]\n"
"ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
"udot z24.s, z20.b, z0.b[0]\n"
- "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"udot z25.s, z20.b, z1.b[0]\n"
"udot z26.s, z20.b, z2.b[0]\n"
"udot z27.s, z20.b, z3.b[0]\n"
@@ -7234,7 +8526,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z20.b, z5.b[0]\n"
"udot z30.s, z20.b, z6.b[0]\n"
"udot z31.s, z20.b, z7.b[0]\n"
- "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
"udot z24.s, z21.b, z0.b[1]\n"
"udot z25.s, z21.b, z1.b[1]\n"
"udot z26.s, z21.b, z2.b[1]\n"
@@ -7243,7 +8534,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z21.b, z5.b[1]\n"
"udot z30.s, z21.b, z6.b[1]\n"
"udot z31.s, z21.b, z7.b[1]\n"
- "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
"udot z24.s, z22.b, z0.b[2]\n"
"udot z25.s, z22.b, z1.b[2]\n"
"udot z26.s, z22.b, z2.b[2]\n"
@@ -7252,7 +8542,6 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z22.b, z5.b[2]\n"
"udot z30.s, z22.b, z6.b[2]\n"
"udot z31.s, z22.b, z7.b[2]\n"
- "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
"udot z24.s, z23.b, z0.b[3]\n"
"udot z25.s, z23.b, z1.b[3]\n"
"udot z26.s, z23.b, z2.b[3]\n"
@@ -7266,49 +8555,235 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"st1w z24.s, p7, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"mov z24.s, #0\n"
- "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
- "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
- "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
"st1w z25.s, p7, [c_ptr1]\n"
"addvl c_ptr1, c_ptr1, #1\n"
"mov z25.s, #0\n"
- "ld1rqb z1.b, p7/z, [a_ptr1]\n"
- "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
"st1w z26.s, p7, [c_ptr2]\n"
+ "addvl c_ptr2, c_ptr2, #1\n"
"mov z26.s, #0\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "st1w z27.s, p7, [c_ptr3]\n"
+ "addvl c_ptr3, c_ptr3, #1\n"
+ "mov z27.s, #0\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "st1w z28.s, p7, [c_ptr4]\n"
+ "addvl c_ptr4, c_ptr4, #1\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "st1w z29.s, p7, [c_ptr5]\n"
+ "addvl c_ptr5, c_ptr5, #1\n"
+ "mov z29.s, #0\n"
"ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "st1w z30.s, p7, [c_ptr6]\n"
+ "mov z30.s, #0\n"
"ld1rqb z3.b, p7/z, [a_ptr3]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"udot z25.s, z16.b, z1.b[0]\n"
- "st1w z27.s, p7, [c_ptr3]\n"
- "mov z27.s, #0\n"
"ld1rqb z4.b, p7/z, [a_ptr4]\n"
"udot z26.s, z16.b, z2.b[0]\n"
+ "st1w z31.s, p7, [c_ptr7]\n"
+ "mov z31.s, #0\n"
"ld1rqb z5.b, p7/z, [a_ptr5]\n"
- "udot z24.s, z17.b, z0.b[1]\n"
- "st1w z28.s, p7, [c_ptr4]\n"
- "mov z28.s, #0\n"
- "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"udot z27.s, z16.b, z3.b[0]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
"ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "addvl c_ptr6, c_ptr6, #1\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
+ "addvl c_ptr7, c_ptr7, #1\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "ld1b z16.b, p7/z, [%[b_ptr0]]\n"
"udot z25.s, z17.b, z1.b[1]\n"
- "st1w z29.s, p7, [c_ptr5]\n"
- "mov z29.s, #0\n"
- "addvl c_ptr3, c_ptr3, #1\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "ld1b z17.b, p7/z, [%[b_ptr0], #1, MUL VL]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "ld1b z18.b, p7/z, [%[b_ptr0], #2, MUL VL]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x10]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x10]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x10]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x10]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x10]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x10]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x10]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x10]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "ld1b z19.b, p7/z, [%[b_ptr0], #3, MUL VL]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "ld1b z20.b, p7/z, [%[b_ptr0], #4, MUL VL]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "ld1b z21.b, p7/z, [%[b_ptr0], #5, MUL VL]\n"
+ "udot z24.s, z22.b, z0.b[2]\n"
+ "udot z25.s, z22.b, z1.b[2]\n"
+ "udot z26.s, z22.b, z2.b[2]\n"
+ "udot z27.s, z22.b, z3.b[2]\n"
+ "udot z28.s, z22.b, z4.b[2]\n"
+ "udot z29.s, z22.b, z5.b[2]\n"
+ "udot z30.s, z22.b, z6.b[2]\n"
+ "udot z31.s, z22.b, z7.b[2]\n"
+ "ld1b z22.b, p7/z, [%[b_ptr0], #6, MUL VL]\n"
+ "udot z24.s, z23.b, z0.b[3]\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0], #0x20]\n"
+ "udot z25.s, z23.b, z1.b[3]\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1, #0x20]\n"
+ "udot z26.s, z23.b, z2.b[3]\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2, #0x20]\n"
+ "udot z27.s, z23.b, z3.b[3]\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3, #0x20]\n"
+ "udot z28.s, z23.b, z4.b[3]\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4, #0x20]\n"
+ "udot z29.s, z23.b, z5.b[3]\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5, #0x20]\n"
+ "udot z30.s, z23.b, z6.b[3]\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6, #0x20]\n"
+ "udot z31.s, z23.b, z7.b[3]\n"
+ "ld1b z23.b, p7/z, [%[b_ptr0], #7, MUL VL]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7, #0x20]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "addvl %[b_ptr0], %[b_ptr0], #8\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
"udot z28.s, z16.b, z4.b[0]\n"
- "st1w z30.s, p7, [c_ptr6]\n"
- "mov z30.s, #0\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"udot z29.s, z16.b, z5.b[0]\n"
- "st1w z31.s, p7, [c_ptr7]\n"
+ "udot z30.s, z16.b, z6.b[0]\n"
+ "udot z31.s, z16.b, z7.b[0]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
+ "udot z26.s, z17.b, z2.b[1]\n"
+ "udot z27.s, z17.b, z3.b[1]\n"
+ "udot z28.s, z17.b, z4.b[1]\n"
+ "udot z29.s, z17.b, z5.b[1]\n"
+ "udot z30.s, z17.b, z6.b[1]\n"
+ "udot z31.s, z17.b, z7.b[1]\n"
+ "udot z24.s, z18.b, z0.b[2]\n"
+ "udot z25.s, z18.b, z1.b[2]\n"
+ "udot z26.s, z18.b, z2.b[2]\n"
+ "udot z27.s, z18.b, z3.b[2]\n"
+ "udot z28.s, z18.b, z4.b[2]\n"
+ "udot z29.s, z18.b, z5.b[2]\n"
+ "udot z30.s, z18.b, z6.b[2]\n"
+ "udot z31.s, z18.b, z7.b[2]\n"
+ "udot z24.s, z19.b, z0.b[3]\n"
+ "ld1rqb z0.b, p6/z, [%[a_ptr0], #0x30]\n"
+ "udot z25.s, z19.b, z1.b[3]\n"
+ "ld1rqb z1.b, p6/z, [a_ptr1, #0x30]\n"
+ "udot z26.s, z19.b, z2.b[3]\n"
+ "ld1rqb z2.b, p6/z, [a_ptr2, #0x30]\n"
+ "udot z27.s, z19.b, z3.b[3]\n"
+ "ld1rqb z3.b, p6/z, [a_ptr3, #0x30]\n"
+ "udot z28.s, z19.b, z4.b[3]\n"
+ "ld1rqb z4.b, p6/z, [a_ptr4, #0x30]\n"
+ "udot z29.s, z19.b, z5.b[3]\n"
+ "ld1rqb z5.b, p6/z, [a_ptr5, #0x30]\n"
+ "udot z30.s, z19.b, z6.b[3]\n"
+ "ld1rqb z6.b, p6/z, [a_ptr6, #0x30]\n"
+ "udot z31.s, z19.b, z7.b[3]\n"
+ "ld1rqb z7.b, p6/z, [a_ptr7, #0x30]\n"
+ "udot z24.s, z20.b, z0.b[0]\n"
+ "udot z25.s, z20.b, z1.b[0]\n"
+ "udot z26.s, z20.b, z2.b[0]\n"
+ "udot z27.s, z20.b, z3.b[0]\n"
+ "udot z28.s, z20.b, z4.b[0]\n"
+ "udot z29.s, z20.b, z5.b[0]\n"
+ "udot z30.s, z20.b, z6.b[0]\n"
+ "udot z31.s, z20.b, z7.b[0]\n"
+ "udot z24.s, z21.b, z0.b[1]\n"
+ "udot z25.s, z21.b, z1.b[1]\n"
+ "udot z26.s, z21.b, z2.b[1]\n"
+ "udot z27.s, z21.b, z3.b[1]\n"
+ "udot z28.s, z21.b, z4.b[1]\n"
+ "udot z29.s, z21.b, z5.b[1]\n"
+ "udot z30.s, z21.b, z6.b[1]\n"
+ "udot z31.s, z21.b, z7.b[1]\n"
+ "udot z24.s, z22.b, z0.b[2]\n"
+ "udot z25.s, z22.b, z1.b[2]\n"
+ "udot z26.s, z22.b, z2.b[2]\n"
+ "udot z27.s, z22.b, z3.b[2]\n"
+ "udot z28.s, z22.b, z4.b[2]\n"
+ "udot z29.s, z22.b, z5.b[2]\n"
+ "udot z30.s, z22.b, z6.b[2]\n"
+ "udot z31.s, z22.b, z7.b[2]\n"
+ "udot z24.s, z23.b, z0.b[3]\n"
+ "udot z25.s, z23.b, z1.b[3]\n"
+ "udot z26.s, z23.b, z2.b[3]\n"
+ "udot z27.s, z23.b, z3.b[3]\n"
+ "udot z28.s, z23.b, z4.b[3]\n"
+ "udot z29.s, z23.b, z5.b[3]\n"
+ "udot z30.s, z23.b, z6.b[3]\n"
+ "udot z31.s, z23.b, z7.b[3]\n"
+ "b 5f\n"
+ "2:\n"
+ "mov z24.s, #0\n"
+ "ld1rqb z0.b, p7/z, [%[a_ptr0]]\n"
+ "mov z25.s, #0\n"
+ "ld1rqb z1.b, p7/z, [a_ptr1]\n"
+ "mov z26.s, #0\n"
+ "ld1rqb z2.b, p7/z, [a_ptr2]\n"
+ "mov z27.s, #0\n"
+ "ld1rqb z3.b, p7/z, [a_ptr3]\n"
+ "mov z28.s, #0\n"
+ "ld1rqb z4.b, p7/z, [a_ptr4]\n"
+ "mov z29.s, #0\n"
+ "ld1rqb z5.b, p7/z, [a_ptr5]\n"
+ "mov z30.s, #0\n"
+ "ld1rqb z6.b, p7/z, [a_ptr6]\n"
"mov z31.s, #0\n"
- "addvl c_ptr5, c_ptr5, #1\n"
+ "ld1rqb z7.b, p7/z, [a_ptr7]\n"
+ "udot z24.s, z16.b, z0.b[0]\n"
+ "udot z25.s, z16.b, z1.b[0]\n"
+ "udot z26.s, z16.b, z2.b[0]\n"
+ "udot z27.s, z16.b, z3.b[0]\n"
+ "udot z28.s, z16.b, z4.b[0]\n"
+ "udot z29.s, z16.b, z5.b[0]\n"
"udot z30.s, z16.b, z6.b[0]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"udot z31.s, z16.b, z7.b[0]\n"
"ld1b z16.b, p7/z, [%[b_ptr0]]\n"
+ "udot z24.s, z17.b, z0.b[1]\n"
+ "udot z25.s, z17.b, z1.b[1]\n"
"udot z26.s, z17.b, z2.b[1]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
"udot z27.s, z17.b, z3.b[1]\n"
"udot z28.s, z17.b, z4.b[1]\n"
"udot z29.s, z17.b, z5.b[1]\n"
@@ -7458,23 +8933,16 @@ void sve_smallK_hybrid_u8u32_dot_1VLx8(const uint8_t *A, int lda, const uint8_t
"udot z29.s, z23.b, z5.b[3]\n"
"udot z30.s, z23.b, z6.b[3]\n"
"udot z31.s, z23.b, z7.b[3]\n"
- "2:\n"
+ "5:\n"
"st1w z24.s, p0, [%[c_ptr0]]\n"
"addvl %[c_ptr0], %[c_ptr0], #1\n"
"st1w z25.s, p0, [c_ptr1]\n"
- "addvl c_ptr1, c_ptr1, #1\n"
"st1w z26.s, p0, [c_ptr2]\n"
- "addvl c_ptr2, c_ptr2, #1\n"
"st1w z27.s, p0, [c_ptr3]\n"
- "addvl c_ptr3, c_ptr3, #1\n"
"st1w z28.s, p0, [c_ptr4]\n"
- "addvl c_ptr4, c_ptr4, #1\n"
"st1w z29.s, p0, [c_ptr5]\n"
- "addvl c_ptr5, c_ptr5, #1\n"
"st1w z30.s, p0, [c_ptr6]\n"
- "addvl c_ptr6, c_ptr6, #1\n"
"st1w z31.s, p0, [c_ptr7]\n"
- "addvl c_ptr7, c_ptr7, #1\n"
".unreq a_ptr1\n"
".unreq a_ptr2\n"
".unreq a_ptr3\n"
diff --git a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
index eec842d09f..fdb4f584d8 100644
--- a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
+++ b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
@@ -110,7 +110,7 @@ public:
QuantizeWrapper operator=(const QuantizeWrapper &) = delete;
QuantizeWrapper(const GemmArgs &args, const Requantize32 &qp) : _params(qp), _args(args), _barrier(args._maxthreads) {
- GemmArgs newargs = GemmArgs(args._ci, args._Msize, args._Nsize, args._Ksize, args._nbatches, args._nmulti, Activation(), args._maxthreads, nullptr);
+ GemmArgs newargs = GemmArgs(args._ci, args._Msize, args._Nsize, args._Ksize, args._Ksections, args._nbatches, args._nmulti, args._indirect_input, Activation(), args._maxthreads);
_subgemm = gemm<To, Tgemm>(newargs);
if (_subgemm == nullptr) {
diff --git a/src/core/NEON/kernels/arm_gemm/quantized.cpp b/src/core/NEON/kernels/arm_gemm/quantized.cpp
index cac02cf28e..111d01ed3a 100644
--- a/src/core/NEON/kernels/arm_gemm/quantized.cpp
+++ b/src/core/NEON/kernels/arm_gemm/quantized.cpp
@@ -301,6 +301,179 @@ void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigne
out_ptr1 += 16;
}
+ // We are often quantizing one block of interleaved kernel output at a time - these are three registers
+ // wide. Special case that here.
+ if (regs==3) {
+ regs -= 3;
+
+ int32x4_t v_mul0;
+ int32x4_t v_mul1;
+ int32x4_t v_mul2;
+
+ int32x4_t v_shf0;
+ int32x4_t v_shf1;
+ int32x4_t v_shf2;
+
+ int32x4_t v_shf0l;
+ int32x4_t v_shf1l;
+ int32x4_t v_shf2l;
+
+ if (per_channel) {
+ v_mul0 = vld1q_s32(perch_mul_ptr);
+ v_mul1 = vld1q_s32(perch_mul_ptr + 4);
+ v_mul2 = vld1q_s32(perch_mul_ptr + 8);
+ perch_mul_ptr += 12;
+
+ v_shf0 = vld1q_s32(perch_shift_ptr);
+ v_shf1 = vld1q_s32(perch_shift_ptr + 4);
+ v_shf2 = vld1q_s32(perch_shift_ptr + 8);
+ perch_shift_ptr += 12;
+
+ if (do_left_shift) {
+ v_shf0l = vld1q_s32(perch_shiftl_ptr);
+ v_shf1l = vld1q_s32(perch_shiftl_ptr + 4);
+ v_shf2l = vld1q_s32(perch_shiftl_ptr + 8);
+ perch_shiftl_ptr += 12;
+ }
+ } else {
+ v_mul0=v_mul1=v_mul2=v_mul;
+ v_shf0=v_shf1=v_shf2=v_right_shift;
+ v_shf0l=v_shf1l=v_shf2l=v_left_shift;
+ }
+
+ // Load column bias values
+ int32x4_t v_col0 = vld1q_s32(colptr);
+ int32x4_t v_col1 = vld1q_s32(colptr + 4);
+ int32x4_t v_col2 = vld1q_s32(colptr + 8);
+ colptr += 12;
+
+ // Load input data (row 0)
+ int32x4_t v_in00 = vld1q_s32(in_ptr);
+ int32x4_t v_in01 = vld1q_s32(in_ptr + 4);
+ int32x4_t v_in02 = vld1q_s32(in_ptr + 8);
+ in_ptr += 12;
+
+ // Load input data (row 1)
+ int32x4_t v_in10 = vld1q_s32(in_ptr1);
+ int32x4_t v_in11 = vld1q_s32(in_ptr1 + 4);
+ int32x4_t v_in12 = vld1q_s32(in_ptr1 + 8);
+ in_ptr1 += 12;
+
+ // Add on row bias and column bias
+ v_in00 = vaddq_s32(v_in00, v_row_sum);
+ v_in01 = vaddq_s32(v_in01, v_row_sum);
+ v_in02 = vaddq_s32(v_in02, v_row_sum);
+
+ v_in10 = vaddq_s32(v_in10, v_row_sum1);
+ v_in11 = vaddq_s32(v_in11, v_row_sum1);
+ v_in12 = vaddq_s32(v_in12, v_row_sum1);
+
+ v_in00 = vaddq_s32(v_in00, v_col0);
+ v_in01 = vaddq_s32(v_in01, v_col1);
+ v_in02 = vaddq_s32(v_in02, v_col2);
+
+ v_in10 = vaddq_s32(v_in10, v_col0);
+ v_in11 = vaddq_s32(v_in11, v_col1);
+ v_in12 = vaddq_s32(v_in12, v_col2);
+
+ // Quantize
+
+ // If a left shift is needed it needs to happen first.
+ if (do_left_shift) {
+ v_in00 = vrshlq_s32(v_in00, v_shf0l);
+ v_in01 = vrshlq_s32(v_in01, v_shf1l);
+ v_in02 = vrshlq_s32(v_in02, v_shf2l);
+
+ v_in10 = vrshlq_s32(v_in10, v_shf0l);
+ v_in11 = vrshlq_s32(v_in11, v_shf1l);
+ v_in12 = vrshlq_s32(v_in12, v_shf2l);
+ }
+
+ // Multiply
+ v_in00 = vqrdmulhq_s32(v_in00, v_mul0);
+ v_in01 = vqrdmulhq_s32(v_in01, v_mul1);
+ v_in02 = vqrdmulhq_s32(v_in02, v_mul2);
+
+ v_in10 = vqrdmulhq_s32(v_in10, v_mul0);
+ v_in11 = vqrdmulhq_s32(v_in11, v_mul1);
+ v_in12 = vqrdmulhq_s32(v_in12, v_mul2);
+
+ // Compute and add on corrective offset
+ if (do_shift_correction) {
+ int32x4_t v_temp00 = vandq_s32(v_in00, v_shf0);
+ int32x4_t v_temp01 = vandq_s32(v_in01, v_shf1);
+ int32x4_t v_temp02 = vandq_s32(v_in02, v_shf2);
+
+ int32x4_t v_temp10 = vandq_s32(v_in10, v_shf0);
+ int32x4_t v_temp11 = vandq_s32(v_in11, v_shf1);
+ int32x4_t v_temp12 = vandq_s32(v_in12, v_shf2);
+
+ v_temp00 = vshrq_n_s32(v_temp00, 31);
+ v_temp01 = vshrq_n_s32(v_temp01, 31);
+ v_temp02 = vshrq_n_s32(v_temp02, 31);
+
+ v_temp10 = vshrq_n_s32(v_temp10, 31);
+ v_temp11 = vshrq_n_s32(v_temp11, 31);
+ v_temp12 = vshrq_n_s32(v_temp12, 31);
+
+ v_in00 = vqaddq_s32(v_in00, v_temp00);
+ v_in01 = vqaddq_s32(v_in01, v_temp01);
+ v_in02 = vqaddq_s32(v_in02, v_temp02);
+
+ v_in10 = vqaddq_s32(v_in10, v_temp10);
+ v_in11 = vqaddq_s32(v_in11, v_temp11);
+ v_in12 = vqaddq_s32(v_in12, v_temp12);
+ }
+
+ v_in00 = vrshlq_s32(v_in00, v_shf0);
+ v_in01 = vrshlq_s32(v_in01, v_shf1);
+ v_in02 = vrshlq_s32(v_in02, v_shf2);
+
+ v_in10 = vrshlq_s32(v_in10, v_shf0);
+ v_in11 = vrshlq_s32(v_in11, v_shf1);
+ v_in12 = vrshlq_s32(v_in12, v_shf2);
+
+ v_in00 = vaddq_s32(v_in00, v_c_offset);
+ v_in01 = vaddq_s32(v_in01, v_c_offset);
+ v_in02 = vaddq_s32(v_in02, v_c_offset);
+
+ v_in10 = vaddq_s32(v_in10, v_c_offset);
+ v_in11 = vaddq_s32(v_in11, v_c_offset);
+ v_in12 = vaddq_s32(v_in12, v_c_offset);
+
+ v_in00 = vmaxq_s32(v_in00, v_minval);
+ v_in01 = vmaxq_s32(v_in01, v_minval);
+ v_in02 = vmaxq_s32(v_in02, v_minval);
+
+ v_in10 = vmaxq_s32(v_in10, v_minval);
+ v_in11 = vmaxq_s32(v_in11, v_minval);
+ v_in12 = vmaxq_s32(v_in12, v_minval);
+
+ v_in00 = vminq_s32(v_in00, v_maxval);
+ v_in01 = vminq_s32(v_in01, v_maxval);
+ v_in02 = vminq_s32(v_in02, v_maxval);
+
+ v_in10 = vminq_s32(v_in10, v_maxval);
+ v_in11 = vminq_s32(v_in11, v_maxval);
+ v_in12 = vminq_s32(v_in12, v_maxval);
+
+ int16x8_t v_uz00 = vuzp1q_s16(vreinterpretq_s16_s32(v_in00), vreinterpretq_s16_s32(v_in01));
+ int16x8_t v_uz01 = vuzp1q_s16(vreinterpretq_s16_s32(v_in02), vreinterpretq_s16_s32(v_in02));
+
+ int16x8_t v_uz10 = vuzp1q_s16(vreinterpretq_s16_s32(v_in10), vreinterpretq_s16_s32(v_in11));
+ int16x8_t v_uz11 = vuzp1q_s16(vreinterpretq_s16_s32(v_in12), vreinterpretq_s16_s32(v_in12));
+
+ int8x16_t v_uz0 = vuzp1q_s8(vreinterpretq_s8_s16(v_uz00), vreinterpretq_s8_s16(v_uz01));
+ int8x16_t v_uz1 = vuzp1q_s8(vreinterpretq_s8_s16(v_uz10), vreinterpretq_s8_s16(v_uz11));
+
+ vst1q_lane_s64(reinterpret_cast<int64_t *>(out_ptr), vreinterpretq_s64_s8(v_uz0), 0);
+ vst1q_lane_s32(reinterpret_cast<int32_t *>(out_ptr + 8), vreinterpretq_s32_s8(v_uz0), 2);
+ out_ptr += 12;
+ vst1q_lane_s64(reinterpret_cast<int64_t *>(out_ptr1), vreinterpretq_s64_s8(v_uz1), 0);
+ vst1q_lane_s32(reinterpret_cast<int32_t *>(out_ptr1 + 8), vreinterpretq_s32_s8(v_uz1), 2);
+ out_ptr1 += 12;
+ }
+
while (regs--) {
int32x4_t v_mul0;
int32x4_t v_shf0;
diff --git a/src/core/NEON/kernels/arm_gemm/quantized.hpp b/src/core/NEON/kernels/arm_gemm/quantized.hpp
index b0e0c3b580..3f3443025c 100644
--- a/src/core/NEON/kernels/arm_gemm/quantized.hpp
+++ b/src/core/NEON/kernels/arm_gemm/quantized.hpp
@@ -23,6 +23,8 @@
*/
#pragma once
+#include "utils.hpp" // IndirectInputArg
+
namespace arm_gemm {
template<typename Tin, typename Tout>
@@ -39,4 +41,8 @@ void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int h
const T *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth,
unsigned int multi, unsigned int first_col);
+template<typename T>
+void row_sums_indirect(unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<T> A_arg,
+ size_t M, int32_t *output_ptr, const Requantize32 *qp);
+
} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp
new file mode 100644
index 0000000000..5433676558
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_s8.cpp
@@ -0,0 +1,1160 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "quantized.hpp"
+#include "utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+template<>
+void row_sums_indirect(
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<int8_t> A_arg,
+ size_t M, int32_t *out_ptr, const Requantize32 *qp
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings;
+ const unsigned int *string_lengths;
+ unsigned int input_initial_col;
+ } ka;
+
+ unsigned long flags=0;
+ void *input_ptr;
+ size_t input_offset;
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ input_offset=A_arg.direct.stride;
+ }
+
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+
+ __asm__ __volatile__(
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1r { v2.4s }, [x19]\n"
+ "neg v2.4s, v2.4s\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 86f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 69f\n"
+ "beq 52f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 35f\n"
+ "beq 18f\n"
+ "movi v1.8h, #0x0\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "movi v0.4s, #0x0\n"
+ "mov x9, #0x0\n"
+ "mov x28, #0x0\n"
+ "2:" // Height 1: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x19, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 3f\n"
+ "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x19, x19, %x[input_offset], LSL #3\n"
+ "ldr x26, [x19, #0x0]\n"
+ "cbnz x28, 4f\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "b 4f\n"
+ "3:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "4:" // Height 1: input setup done
+ "cmp x27, #0x10\n"
+ "blt 8f\n"
+ "cmp x27, #0x20\n"
+ "blt 7f\n"
+ "5:" // Height 1: Multiply loop: Main loop head
+ "ldr q31, [x26, #0x0]\n"
+ "cmp x9, #0x7e\n"
+ "add x26, x26, #0x10\n"
+ "blt 6f\n"
+ "sadalp v0.4s, v1.8h\n"
+ "movi v1.8h, #0x0\n"
+ "mov x9, #0x0\n"
+ "6:" // Height 1: Multiply loop: unique 1: no collapse
+ "sadalp v1.8h, v31.16b\n"
+ "add x9, x9, #0x1\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x20\n"
+ "bge 5b\n"
+ "7:" // Height 1: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q31, [x26, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "sadalp v1.8h, v31.16b\n"
+ "8:" // Height 1: Multiply loop: Main loop skip
+ "cbz x27, 17f\n"
+ "tbz x27, #3, 12f\n"
+ "ldr d31, [x26], #0x8\n"
+ "tbz x27, #2, 10f\n"
+ "ld1 { v31.s }[2], [x26], #0x4\n"
+ "tbz x27, #1, 9f\n"
+ "ld1 { v31.h }[6], [x26], #0x2\n"
+ "tbz x27, #0, 16f\n"
+ "ld1 { v31.b }[14], [x26]\n"
+ "b 16f\n"
+ "9:" // Height 1: Multiply loop: Ragged operand read: partial_1_12
+ "tbz x27, #0, 16f\n"
+ "ld1 { v31.b }[12], [x26]\n"
+ "b 16f\n"
+ "10:" // Height 1: Multiply loop: Ragged operand read: partial_2_8
+ "tbz x27, #1, 11f\n"
+ "ld1 { v31.h }[4], [x26], #0x2\n"
+ "tbz x27, #0, 16f\n"
+ "ld1 { v31.b }[10], [x26]\n"
+ "b 16f\n"
+ "11:" // Height 1: Multiply loop: Ragged operand read: partial_1_8
+ "tbz x27, #0, 16f\n"
+ "ld1 { v31.b }[8], [x26]\n"
+ "b 16f\n"
+ "12:" // Height 1: Multiply loop: Ragged operand read: partial_4_0
+ "tbz x27, #2, 14f\n"
+ "ldr s31, [x26], #0x4\n"
+ "tbz x27, #1, 13f\n"
+ "ld1 { v31.h }[2], [x26], #0x2\n"
+ "tbz x27, #0, 16f\n"
+ "ld1 { v31.b }[6], [x26]\n"
+ "b 16f\n"
+ "13:" // Height 1: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 16f\n"
+ "ld1 { v31.b }[4], [x26]\n"
+ "b 16f\n"
+ "14:" // Height 1: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 15f\n"
+ "ldr h31, [x26], #0x2\n"
+ "tbz x27, #0, 16f\n"
+ "ld1 { v31.b }[2], [x26]\n"
+ "b 16f\n"
+ "15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b31, [x26, #0x0]\n"
+ "16:" // Height 1: Multiply loop: Ragged operand read: Done
+ "sadalp v1.8h, v31.16b\n"
+ "17:" // Height 1: Multiply loop: No odd multiplies
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 2b\n"
+ "sadalp v0.4s, v1.8h\n"
+ "addp v0.4s, v0.4s, v0.4s\n"
+ "addp v0.4s, v0.4s, v0.4s\n"
+ "mul v0.4s, v0.4s, v2.4s\n"
+ "str s0, [%x[out_ptr]], #0x4\n"
+ "b 104f\n"
+ "18:" // Height 2
+ "movi v1.8h, #0x0\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "mov x9, #0x0\n"
+ "movi v0.4s, #0x0\n"
+ "mov x28, #0x0\n"
+ "movi v30.8h, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "19:" // Height 2: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x19, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 20f\n"
+ "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x19, x19, %x[input_offset], LSL #3\n"
+ "ldr x26, [x19, #0x0]\n"
+ "ldr x25, [x19, #0x8]\n"
+ "cbnz x28, 21f\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
+ "b 21f\n"
+ "20:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, %x[input_offset]\n"
+ "21:" // Height 2: input setup done
+ "cmp x27, #0x10\n"
+ "blt 25f\n"
+ "cmp x27, #0x20\n"
+ "blt 24f\n"
+ "22:" // Height 2: Multiply loop: Main loop head
+ "ldr q31, [x26, #0x0]\n"
+ "ldr q28, [x25, #0x0]\n"
+ "cmp x9, #0x7e\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "blt 23f\n"
+ "sadalp v0.4s, v1.8h\n"
+ "movi v1.8h, #0x0\n"
+ "sadalp v29.4s, v30.8h\n"
+ "movi v30.8h, #0x0\n"
+ "mov x9, #0x0\n"
+ "23:" // Height 2: Multiply loop: unique 2: no collapse
+ "sadalp v1.8h, v31.16b\n"
+ "sadalp v30.8h, v28.16b\n"
+ "add x9, x9, #0x1\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x20\n"
+ "bge 22b\n"
+ "24:" // Height 2: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q31, [x26, #0x0]\n"
+ "ldr q28, [x25, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "sadalp v1.8h, v31.16b\n"
+ "sadalp v30.8h, v28.16b\n"
+ "25:" // Height 2: Multiply loop: Main loop skip
+ "cbz x27, 34f\n"
+ "tbz x27, #3, 29f\n"
+ "ldr d31, [x26], #0x8\n"
+ "ldr d28, [x25], #0x8\n"
+ "tbz x27, #2, 27f\n"
+ "ld1 { v31.s }[2], [x26], #0x4\n"
+ "ld1 { v28.s }[2], [x25], #0x4\n"
+ "tbz x27, #1, 26f\n"
+ "ld1 { v31.h }[6], [x26], #0x2\n"
+ "ld1 { v28.h }[6], [x25], #0x2\n"
+ "tbz x27, #0, 33f\n"
+ "ld1 { v31.b }[14], [x26]\n"
+ "ld1 { v28.b }[14], [x25]\n"
+ "b 33f\n"
+ "26:" // Height 2: Multiply loop: Ragged operand read: partial_1_12
+ "tbz x27, #0, 33f\n"
+ "ld1 { v31.b }[12], [x26]\n"
+ "ld1 { v28.b }[12], [x25]\n"
+ "b 33f\n"
+ "27:" // Height 2: Multiply loop: Ragged operand read: partial_2_8
+ "tbz x27, #1, 28f\n"
+ "ld1 { v31.h }[4], [x26], #0x2\n"
+ "ld1 { v28.h }[4], [x25], #0x2\n"
+ "tbz x27, #0, 33f\n"
+ "ld1 { v31.b }[10], [x26]\n"
+ "ld1 { v28.b }[10], [x25]\n"
+ "b 33f\n"
+ "28:" // Height 2: Multiply loop: Ragged operand read: partial_1_8
+ "tbz x27, #0, 33f\n"
+ "ld1 { v31.b }[8], [x26]\n"
+ "ld1 { v28.b }[8], [x25]\n"
+ "b 33f\n"
+ "29:" // Height 2: Multiply loop: Ragged operand read: partial_4_0
+ "tbz x27, #2, 31f\n"
+ "ldr s31, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
+ "tbz x27, #1, 30f\n"
+ "ld1 { v31.h }[2], [x26], #0x2\n"
+ "ld1 { v28.h }[2], [x25], #0x2\n"
+ "tbz x27, #0, 33f\n"
+ "ld1 { v31.b }[6], [x26]\n"
+ "ld1 { v28.b }[6], [x25]\n"
+ "b 33f\n"
+ "30:" // Height 2: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 33f\n"
+ "ld1 { v31.b }[4], [x26]\n"
+ "ld1 { v28.b }[4], [x25]\n"
+ "b 33f\n"
+ "31:" // Height 2: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 32f\n"
+ "ldr h31, [x26], #0x2\n"
+ "ldr h28, [x25], #0x2\n"
+ "tbz x27, #0, 33f\n"
+ "ld1 { v31.b }[2], [x26]\n"
+ "ld1 { v28.b }[2], [x25]\n"
+ "b 33f\n"
+ "32:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b31, [x26, #0x0]\n"
+ "ldr b28, [x25, #0x0]\n"
+ "33:" // Height 2: Multiply loop: Ragged operand read: Done
+ "sadalp v1.8h, v31.16b\n"
+ "sadalp v30.8h, v28.16b\n"
+ "34:" // Height 2: Multiply loop: No odd multiplies
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 19b\n"
+ "sadalp v0.4s, v1.8h\n"
+ "sadalp v29.4s, v30.8h\n"
+ "addp v0.4s, v0.4s, v29.4s\n"
+ "addp v0.4s, v0.4s, v0.4s\n"
+ "mul v0.4s, v0.4s, v2.4s\n"
+ "str d0, [%x[out_ptr]], #0x8\n"
+ "b 104f\n"
+ "35:" // Height 3
+ "movi v1.8h, #0x0\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "mov x9, #0x0\n"
+ "movi v0.4s, #0x0\n"
+ "mov x28, #0x0\n"
+ "movi v30.8h, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v27.8h, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "36:" // Height 3: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x19, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 37f\n"
+ "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x19, x19, %x[input_offset], LSL #3\n"
+ "ldr x26, [x19, #0x0]\n"
+ "ldr x25, [x19, #0x8]\n"
+ "ldr x24, [x19, #0x10]\n"
+ "cbnz x28, 38f\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "b 38f\n"
+ "37:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, %x[input_offset]\n"
+ "add x24, x25, %x[input_offset]\n"
+ "38:" // Height 3: input setup done
+ "cmp x27, #0x10\n"
+ "blt 42f\n"
+ "cmp x27, #0x20\n"
+ "blt 41f\n"
+ "39:" // Height 3: Multiply loop: Main loop head
+ "ldr q31, [x26, #0x0]\n"
+ "ldr q28, [x25, #0x0]\n"
+ "ldr q25, [x24, #0x0]\n"
+ "cmp x9, #0x7e\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "blt 40f\n"
+ "sadalp v0.4s, v1.8h\n"
+ "movi v1.8h, #0x0\n"
+ "sadalp v29.4s, v30.8h\n"
+ "movi v30.8h, #0x0\n"
+ "sadalp v26.4s, v27.8h\n"
+ "movi v27.8h, #0x0\n"
+ "mov x9, #0x0\n"
+ "40:" // Height 3: Multiply loop: unique 3: no collapse
+ "sadalp v1.8h, v31.16b\n"
+ "sadalp v30.8h, v28.16b\n"
+ "sadalp v27.8h, v25.16b\n"
+ "add x9, x9, #0x1\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x20\n"
+ "bge 39b\n"
+ "41:" // Height 3: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q31, [x26, #0x0]\n"
+ "ldr q28, [x25, #0x0]\n"
+ "ldr q25, [x24, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "sadalp v1.8h, v31.16b\n"
+ "sadalp v30.8h, v28.16b\n"
+ "sadalp v27.8h, v25.16b\n"
+ "add x24, x24, #0x10\n"
+ "42:" // Height 3: Multiply loop: Main loop skip
+ "cbz x27, 51f\n"
+ "tbz x27, #3, 46f\n"
+ "ldr d31, [x26], #0x8\n"
+ "ldr d28, [x25], #0x8\n"
+ "ldr d25, [x24], #0x8\n"
+ "tbz x27, #2, 44f\n"
+ "ld1 { v31.s }[2], [x26], #0x4\n"
+ "ld1 { v28.s }[2], [x25], #0x4\n"
+ "ld1 { v25.s }[2], [x24], #0x4\n"
+ "tbz x27, #1, 43f\n"
+ "ld1 { v31.h }[6], [x26], #0x2\n"
+ "ld1 { v28.h }[6], [x25], #0x2\n"
+ "ld1 { v25.h }[6], [x24], #0x2\n"
+ "tbz x27, #0, 50f\n"
+ "ld1 { v31.b }[14], [x26]\n"
+ "ld1 { v28.b }[14], [x25]\n"
+ "ld1 { v25.b }[14], [x24]\n"
+ "b 50f\n"
+ "43:" // Height 3: Multiply loop: Ragged operand read: partial_1_12
+ "tbz x27, #0, 50f\n"
+ "ld1 { v31.b }[12], [x26]\n"
+ "ld1 { v28.b }[12], [x25]\n"
+ "ld1 { v25.b }[12], [x24]\n"
+ "b 50f\n"
+ "44:" // Height 3: Multiply loop: Ragged operand read: partial_2_8
+ "tbz x27, #1, 45f\n"
+ "ld1 { v31.h }[4], [x26], #0x2\n"
+ "ld1 { v28.h }[4], [x25], #0x2\n"
+ "ld1 { v25.h }[4], [x24], #0x2\n"
+ "tbz x27, #0, 50f\n"
+ "ld1 { v31.b }[10], [x26]\n"
+ "ld1 { v28.b }[10], [x25]\n"
+ "ld1 { v25.b }[10], [x24]\n"
+ "b 50f\n"
+ "45:" // Height 3: Multiply loop: Ragged operand read: partial_1_8
+ "tbz x27, #0, 50f\n"
+ "ld1 { v31.b }[8], [x26]\n"
+ "ld1 { v28.b }[8], [x25]\n"
+ "ld1 { v25.b }[8], [x24]\n"
+ "b 50f\n"
+ "46:" // Height 3: Multiply loop: Ragged operand read: partial_4_0
+ "tbz x27, #2, 48f\n"
+ "ldr s31, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
+ "ldr s25, [x24], #0x4\n"
+ "tbz x27, #1, 47f\n"
+ "ld1 { v31.h }[2], [x26], #0x2\n"
+ "ld1 { v28.h }[2], [x25], #0x2\n"
+ "ld1 { v25.h }[2], [x24], #0x2\n"
+ "tbz x27, #0, 50f\n"
+ "ld1 { v31.b }[6], [x26]\n"
+ "ld1 { v28.b }[6], [x25]\n"
+ "ld1 { v25.b }[6], [x24]\n"
+ "b 50f\n"
+ "47:" // Height 3: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 50f\n"
+ "ld1 { v31.b }[4], [x26]\n"
+ "ld1 { v28.b }[4], [x25]\n"
+ "ld1 { v25.b }[4], [x24]\n"
+ "b 50f\n"
+ "48:" // Height 3: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 49f\n"
+ "ldr h31, [x26], #0x2\n"
+ "ldr h28, [x25], #0x2\n"
+ "ldr h25, [x24], #0x2\n"
+ "tbz x27, #0, 50f\n"
+ "ld1 { v31.b }[2], [x26]\n"
+ "ld1 { v28.b }[2], [x25]\n"
+ "ld1 { v25.b }[2], [x24]\n"
+ "b 50f\n"
+ "49:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b31, [x26, #0x0]\n"
+ "ldr b28, [x25, #0x0]\n"
+ "ldr b25, [x24, #0x0]\n"
+ "50:" // Height 3: Multiply loop: Ragged operand read: Done
+ "sadalp v1.8h, v31.16b\n"
+ "sadalp v30.8h, v28.16b\n"
+ "sadalp v27.8h, v25.16b\n"
+ "51:" // Height 3: Multiply loop: No odd multiplies
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 36b\n"
+ "sadalp v0.4s, v1.8h\n"
+ "sadalp v29.4s, v30.8h\n"
+ "addp v0.4s, v0.4s, v29.4s\n"
+ "sadalp v26.4s, v27.8h\n"
+ "addp v0.4s, v0.4s, v0.4s\n"
+ "addp v26.4s, v26.4s, v26.4s\n"
+ "mul v0.4s, v0.4s, v2.4s\n"
+ "str d0, [%x[out_ptr]], #0x8\n"
+ "addp v26.4s, v26.4s, v26.4s\n"
+ "mul v26.4s, v26.4s, v2.4s\n"
+ "str s26, [%x[out_ptr]], #0x4\n"
+ "b 104f\n"
+ "52:" // Height 4
+ "movi v1.8h, #0x0\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "mov x9, #0x0\n"
+ "movi v0.4s, #0x0\n"
+ "mov x28, #0x0\n"
+ "movi v30.8h, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v27.8h, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v24.8h, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "53:" // Height 4: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x19, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 54f\n"
+ "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x19, x19, %x[input_offset], LSL #3\n"
+ "ldr x26, [x19, #0x0]\n"
+ "ldr x25, [x19, #0x8]\n"
+ "ldr x24, [x19, #0x10]\n"
+ "ldr x23, [x19, #0x18]\n"
+ "cbnz x28, 55f\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "b 55f\n"
+ "54:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, %x[input_offset]\n"
+ "add x24, x25, %x[input_offset]\n"
+ "add x23, x24, %x[input_offset]\n"
+ "55:" // Height 4: input setup done
+ "cmp x27, #0x10\n"
+ "blt 59f\n"
+ "cmp x27, #0x20\n"
+ "blt 58f\n"
+ "56:" // Height 4: Multiply loop: Main loop head
+ "ldr q31, [x26, #0x0]\n"
+ "ldr q28, [x25, #0x0]\n"
+ "ldr q25, [x24, #0x0]\n"
+ "ldr q22, [x23, #0x0]\n"
+ "cmp x9, #0x7e\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "blt 57f\n"
+ "sadalp v0.4s, v1.8h\n"
+ "movi v1.8h, #0x0\n"
+ "sadalp v29.4s, v30.8h\n"
+ "movi v30.8h, #0x0\n"
+ "sadalp v26.4s, v27.8h\n"
+ "movi v27.8h, #0x0\n"
+ "sadalp v23.4s, v24.8h\n"
+ "movi v24.8h, #0x0\n"
+ "mov x9, #0x0\n"
+ "57:" // Height 4: Multiply loop: unique 4: no collapse
+ "sadalp v1.8h, v31.16b\n"
+ "sadalp v30.8h, v28.16b\n"
+ "sadalp v27.8h, v25.16b\n"
+ "sadalp v24.8h, v22.16b\n"
+ "add x9, x9, #0x1\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x20\n"
+ "bge 56b\n"
+ "58:" // Height 4: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q31, [x26, #0x0]\n"
+ "ldr q28, [x25, #0x0]\n"
+ "ldr q25, [x24, #0x0]\n"
+ "ldr q22, [x23, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "sadalp v1.8h, v31.16b\n"
+ "sadalp v30.8h, v28.16b\n"
+ "sadalp v27.8h, v25.16b\n"
+ "sadalp v24.8h, v22.16b\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "59:" // Height 4: Multiply loop: Main loop skip
+ "cbz x27, 68f\n"
+ "tbz x27, #3, 63f\n"
+ "ldr d31, [x26], #0x8\n"
+ "ldr d28, [x25], #0x8\n"
+ "ldr d25, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "tbz x27, #2, 61f\n"
+ "ld1 { v31.s }[2], [x26], #0x4\n"
+ "ld1 { v28.s }[2], [x25], #0x4\n"
+ "ld1 { v25.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "tbz x27, #1, 60f\n"
+ "ld1 { v31.h }[6], [x26], #0x2\n"
+ "ld1 { v28.h }[6], [x25], #0x2\n"
+ "ld1 { v25.h }[6], [x24], #0x2\n"
+ "ld1 { v22.h }[6], [x23], #0x2\n"
+ "tbz x27, #0, 67f\n"
+ "ld1 { v31.b }[14], [x26]\n"
+ "ld1 { v28.b }[14], [x25]\n"
+ "ld1 { v25.b }[14], [x24]\n"
+ "ld1 { v22.b }[14], [x23]\n"
+ "b 67f\n"
+ "60:" // Height 4: Multiply loop: Ragged operand read: partial_1_12
+ "tbz x27, #0, 67f\n"
+ "ld1 { v31.b }[12], [x26]\n"
+ "ld1 { v28.b }[12], [x25]\n"
+ "ld1 { v25.b }[12], [x24]\n"
+ "ld1 { v22.b }[12], [x23]\n"
+ "b 67f\n"
+ "61:" // Height 4: Multiply loop: Ragged operand read: partial_2_8
+ "tbz x27, #1, 62f\n"
+ "ld1 { v31.h }[4], [x26], #0x2\n"
+ "ld1 { v28.h }[4], [x25], #0x2\n"
+ "ld1 { v25.h }[4], [x24], #0x2\n"
+ "ld1 { v22.h }[4], [x23], #0x2\n"
+ "tbz x27, #0, 67f\n"
+ "ld1 { v31.b }[10], [x26]\n"
+ "ld1 { v28.b }[10], [x25]\n"
+ "ld1 { v25.b }[10], [x24]\n"
+ "ld1 { v22.b }[10], [x23]\n"
+ "b 67f\n"
+ "62:" // Height 4: Multiply loop: Ragged operand read: partial_1_8
+ "tbz x27, #0, 67f\n"
+ "ld1 { v31.b }[8], [x26]\n"
+ "ld1 { v28.b }[8], [x25]\n"
+ "ld1 { v25.b }[8], [x24]\n"
+ "ld1 { v22.b }[8], [x23]\n"
+ "b 67f\n"
+ "63:" // Height 4: Multiply loop: Ragged operand read: partial_4_0
+ "tbz x27, #2, 65f\n"
+ "ldr s31, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
+ "ldr s25, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "tbz x27, #1, 64f\n"
+ "ld1 { v31.h }[2], [x26], #0x2\n"
+ "ld1 { v28.h }[2], [x25], #0x2\n"
+ "ld1 { v25.h }[2], [x24], #0x2\n"
+ "ld1 { v22.h }[2], [x23], #0x2\n"
+ "tbz x27, #0, 67f\n"
+ "ld1 { v31.b }[6], [x26]\n"
+ "ld1 { v28.b }[6], [x25]\n"
+ "ld1 { v25.b }[6], [x24]\n"
+ "ld1 { v22.b }[6], [x23]\n"
+ "b 67f\n"
+ "64:" // Height 4: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 67f\n"
+ "ld1 { v31.b }[4], [x26]\n"
+ "ld1 { v28.b }[4], [x25]\n"
+ "ld1 { v25.b }[4], [x24]\n"
+ "ld1 { v22.b }[4], [x23]\n"
+ "b 67f\n"
+ "65:" // Height 4: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 66f\n"
+ "ldr h31, [x26], #0x2\n"
+ "ldr h28, [x25], #0x2\n"
+ "ldr h25, [x24], #0x2\n"
+ "ldr h22, [x23], #0x2\n"
+ "tbz x27, #0, 67f\n"
+ "ld1 { v31.b }[2], [x26]\n"
+ "ld1 { v28.b }[2], [x25]\n"
+ "ld1 { v25.b }[2], [x24]\n"
+ "ld1 { v22.b }[2], [x23]\n"
+ "b 67f\n"
+ "66:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b31, [x26, #0x0]\n"
+ "ldr b28, [x25, #0x0]\n"
+ "ldr b25, [x24, #0x0]\n"
+ "ldr b22, [x23, #0x0]\n"
+ "67:" // Height 4: Multiply loop: Ragged operand read: Done
+ "sadalp v1.8h, v31.16b\n"
+ "sadalp v30.8h, v28.16b\n"
+ "sadalp v27.8h, v25.16b\n"
+ "sadalp v24.8h, v22.16b\n"
+ "68:" // Height 4: Multiply loop: No odd multiplies
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 53b\n"
+ "sadalp v0.4s, v1.8h\n"
+ "sadalp v29.4s, v30.8h\n"
+ "addp v0.4s, v0.4s, v29.4s\n"
+ "sadalp v26.4s, v27.8h\n"
+ "sadalp v23.4s, v24.8h\n"
+ "addp v29.4s, v26.4s, v23.4s\n"
+ "addp v0.4s, v0.4s, v29.4s\n"
+ "mul v0.4s, v0.4s, v2.4s\n"
+ "st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
+ "b 104f\n"
+ "69:" // Height 5
+ "movi v1.8h, #0x0\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "mov x9, #0x0\n"
+ "movi v0.4s, #0x0\n"
+ "mov x28, #0x0\n"
+ "movi v30.8h, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v27.8h, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v24.8h, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v21.8h, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "70:" // Height 5: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x19, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 71f\n"
+ "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x19, x19, %x[input_offset], LSL #3\n"
+ "ldr x26, [x19, #0x0]\n"
+ "ldr x25, [x19, #0x8]\n"
+ "ldr x24, [x19, #0x10]\n"
+ "ldr x23, [x19, #0x18]\n"
+ "ldr x22, [x19, #0x20]\n"
+ "cbnz x28, 72f\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 72f\n"
+ "71:" // Height 5: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, %x[input_offset]\n"
+ "add x24, x25, %x[input_offset]\n"
+ "add x23, x24, %x[input_offset]\n"
+ "add x22, x23, %x[input_offset]\n"
+ "72:" // Height 5: input setup done
+ "cmp x27, #0x10\n"
+ "blt 76f\n"
+ "cmp x27, #0x20\n"
+ "blt 75f\n"
+ "73:" // Height 5: Multiply loop: Main loop head
+ "ldr q31, [x26, #0x0]\n"
+ "ldr q28, [x25, #0x0]\n"
+ "ldr q25, [x24, #0x0]\n"
+ "ldr q22, [x23, #0x0]\n"
+ "ldr q19, [x22, #0x0]\n"
+ "cmp x9, #0x7e\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "blt 74f\n"
+ "sadalp v0.4s, v1.8h\n"
+ "movi v1.8h, #0x0\n"
+ "sadalp v29.4s, v30.8h\n"
+ "movi v30.8h, #0x0\n"
+ "sadalp v26.4s, v27.8h\n"
+ "movi v27.8h, #0x0\n"
+ "sadalp v23.4s, v24.8h\n"
+ "movi v24.8h, #0x0\n"
+ "sadalp v20.4s, v21.8h\n"
+ "movi v21.8h, #0x0\n"
+ "mov x9, #0x0\n"
+ "74:" // Height 5: Multiply loop: unique 5: no collapse
+ "sadalp v1.8h, v31.16b\n"
+ "sadalp v30.8h, v28.16b\n"
+ "sadalp v27.8h, v25.16b\n"
+ "sadalp v24.8h, v22.16b\n"
+ "sadalp v21.8h, v19.16b\n"
+ "add x9, x9, #0x1\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x20\n"
+ "bge 73b\n"
+ "75:" // Height 5: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q31, [x26, #0x0]\n"
+ "ldr q28, [x25, #0x0]\n"
+ "ldr q25, [x24, #0x0]\n"
+ "ldr q22, [x23, #0x0]\n"
+ "ldr q19, [x22, #0x0]\n"
+ "sadalp v1.8h, v31.16b\n"
+ "sadalp v30.8h, v28.16b\n"
+ "sadalp v27.8h, v25.16b\n"
+ "sadalp v24.8h, v22.16b\n"
+ "sadalp v21.8h, v19.16b\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "76:" // Height 5: Multiply loop: Main loop skip
+ "cbz x27, 85f\n"
+ "tbz x27, #3, 80f\n"
+ "ldr d31, [x26], #0x8\n"
+ "ldr d28, [x25], #0x8\n"
+ "ldr d25, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "tbz x27, #2, 78f\n"
+ "ld1 { v31.s }[2], [x26], #0x4\n"
+ "ld1 { v28.s }[2], [x25], #0x4\n"
+ "ld1 { v25.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "ld1 { v19.s }[2], [x22], #0x4\n"
+ "tbz x27, #1, 77f\n"
+ "ld1 { v31.h }[6], [x26], #0x2\n"
+ "ld1 { v28.h }[6], [x25], #0x2\n"
+ "ld1 { v25.h }[6], [x24], #0x2\n"
+ "ld1 { v22.h }[6], [x23], #0x2\n"
+ "ld1 { v19.h }[6], [x22], #0x2\n"
+ "tbz x27, #0, 84f\n"
+ "ld1 { v31.b }[14], [x26]\n"
+ "ld1 { v28.b }[14], [x25]\n"
+ "ld1 { v25.b }[14], [x24]\n"
+ "ld1 { v22.b }[14], [x23]\n"
+ "ld1 { v19.b }[14], [x22]\n"
+ "b 84f\n"
+ "77:" // Height 5: Multiply loop: Ragged operand read: partial_1_12
+ "tbz x27, #0, 84f\n"
+ "ld1 { v31.b }[12], [x26]\n"
+ "ld1 { v28.b }[12], [x25]\n"
+ "ld1 { v25.b }[12], [x24]\n"
+ "ld1 { v22.b }[12], [x23]\n"
+ "ld1 { v19.b }[12], [x22]\n"
+ "b 84f\n"
+ "78:" // Height 5: Multiply loop: Ragged operand read: partial_2_8
+ "tbz x27, #1, 79f\n"
+ "ld1 { v31.h }[4], [x26], #0x2\n"
+ "ld1 { v28.h }[4], [x25], #0x2\n"
+ "ld1 { v25.h }[4], [x24], #0x2\n"
+ "ld1 { v22.h }[4], [x23], #0x2\n"
+ "ld1 { v19.h }[4], [x22], #0x2\n"
+ "tbz x27, #0, 84f\n"
+ "ld1 { v31.b }[10], [x26]\n"
+ "ld1 { v28.b }[10], [x25]\n"
+ "ld1 { v25.b }[10], [x24]\n"
+ "ld1 { v22.b }[10], [x23]\n"
+ "ld1 { v19.b }[10], [x22]\n"
+ "b 84f\n"
+ "79:" // Height 5: Multiply loop: Ragged operand read: partial_1_8
+ "tbz x27, #0, 84f\n"
+ "ld1 { v31.b }[8], [x26]\n"
+ "ld1 { v28.b }[8], [x25]\n"
+ "ld1 { v25.b }[8], [x24]\n"
+ "ld1 { v22.b }[8], [x23]\n"
+ "ld1 { v19.b }[8], [x22]\n"
+ "b 84f\n"
+ "80:" // Height 5: Multiply loop: Ragged operand read: partial_4_0
+ "tbz x27, #2, 82f\n"
+ "ldr s31, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
+ "ldr s25, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "ldr s19, [x22], #0x4\n"
+ "tbz x27, #1, 81f\n"
+ "ld1 { v31.h }[2], [x26], #0x2\n"
+ "ld1 { v28.h }[2], [x25], #0x2\n"
+ "ld1 { v25.h }[2], [x24], #0x2\n"
+ "ld1 { v22.h }[2], [x23], #0x2\n"
+ "ld1 { v19.h }[2], [x22], #0x2\n"
+ "tbz x27, #0, 84f\n"
+ "ld1 { v31.b }[6], [x26]\n"
+ "ld1 { v28.b }[6], [x25]\n"
+ "ld1 { v25.b }[6], [x24]\n"
+ "ld1 { v22.b }[6], [x23]\n"
+ "ld1 { v19.b }[6], [x22]\n"
+ "b 84f\n"
+ "81:" // Height 5: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 84f\n"
+ "ld1 { v31.b }[4], [x26]\n"
+ "ld1 { v28.b }[4], [x25]\n"
+ "ld1 { v25.b }[4], [x24]\n"
+ "ld1 { v22.b }[4], [x23]\n"
+ "ld1 { v19.b }[4], [x22]\n"
+ "b 84f\n"
+ "82:" // Height 5: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 83f\n"
+ "ldr h31, [x26], #0x2\n"
+ "ldr h28, [x25], #0x2\n"
+ "ldr h25, [x24], #0x2\n"
+ "ldr h22, [x23], #0x2\n"
+ "ldr h19, [x22], #0x2\n"
+ "tbz x27, #0, 84f\n"
+ "ld1 { v31.b }[2], [x26]\n"
+ "ld1 { v28.b }[2], [x25]\n"
+ "ld1 { v25.b }[2], [x24]\n"
+ "ld1 { v22.b }[2], [x23]\n"
+ "ld1 { v19.b }[2], [x22]\n"
+ "b 84f\n"
+ "83:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b31, [x26, #0x0]\n"
+ "ldr b28, [x25, #0x0]\n"
+ "ldr b25, [x24, #0x0]\n"
+ "ldr b22, [x23, #0x0]\n"
+ "ldr b19, [x22, #0x0]\n"
+ "84:" // Height 5: Multiply loop: Ragged operand read: Done
+ "sadalp v1.8h, v31.16b\n"
+ "sadalp v30.8h, v28.16b\n"
+ "sadalp v27.8h, v25.16b\n"
+ "sadalp v24.8h, v22.16b\n"
+ "sadalp v21.8h, v19.16b\n"
+ "85:" // Height 5: Multiply loop: No odd multiplies
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 70b\n"
+ "sadalp v0.4s, v1.8h\n"
+ "sadalp v29.4s, v30.8h\n"
+ "addp v0.4s, v0.4s, v29.4s\n"
+ "sadalp v26.4s, v27.8h\n"
+ "sadalp v23.4s, v24.8h\n"
+ "addp v29.4s, v26.4s, v23.4s\n"
+ "sadalp v20.4s, v21.8h\n"
+ "addp v0.4s, v0.4s, v29.4s\n"
+ "addp v20.4s, v20.4s, v20.4s\n"
+ "mul v0.4s, v0.4s, v2.4s\n"
+ "st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
+ "addp v20.4s, v20.4s, v20.4s\n"
+ "mul v20.4s, v20.4s, v2.4s\n"
+ "str s20, [%x[out_ptr]], #0x4\n"
+ "b 104f\n"
+ "86:" // Height 6
+ "movi v1.8h, #0x0\n"
+ "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "mov x9, #0x0\n"
+ "movi v0.4s, #0x0\n"
+ "mov x28, #0x0\n"
+ "movi v30.8h, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v27.8h, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v24.8h, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v21.8h, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v18.8h, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "87:" // Height 6: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x19, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 88f\n"
+ "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x19, x19, %x[input_offset], LSL #3\n"
+ "ldr x26, [x19, #0x0]\n"
+ "ldr x25, [x19, #0x8]\n"
+ "ldr x24, [x19, #0x10]\n"
+ "ldr x23, [x19, #0x18]\n"
+ "ldr x22, [x19, #0x20]\n"
+ "ldr x20, [x19, #0x28]\n"
+ "cbnz x28, 89f\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x20, x20, x19\n"
+ "b 89f\n"
+ "88:" // Height 6: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, %x[input_offset]\n"
+ "add x24, x25, %x[input_offset]\n"
+ "add x23, x24, %x[input_offset]\n"
+ "add x22, x23, %x[input_offset]\n"
+ "add x20, x22, %x[input_offset]\n"
+ "89:" // Height 6: input setup done
+ "cmp x27, #0x10\n"
+ "blt 93f\n"
+ "cmp x27, #0x20\n"
+ "blt 92f\n"
+ "90:" // Height 6: Multiply loop: Main loop head
+ "ldr q31, [x26, #0x0]\n"
+ "ldr q28, [x25, #0x0]\n"
+ "ldr q25, [x24, #0x0]\n"
+ "ldr q22, [x23, #0x0]\n"
+ "ldr q19, [x22, #0x0]\n"
+ "ldr q16, [x20, #0x0]\n"
+ "cmp x9, #0x7e\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "add x20, x20, #0x10\n"
+ "blt 91f\n"
+ "sadalp v0.4s, v1.8h\n"
+ "movi v1.8h, #0x0\n"
+ "sadalp v29.4s, v30.8h\n"
+ "movi v30.8h, #0x0\n"
+ "sadalp v26.4s, v27.8h\n"
+ "movi v27.8h, #0x0\n"
+ "sadalp v23.4s, v24.8h\n"
+ "movi v24.8h, #0x0\n"
+ "sadalp v20.4s, v21.8h\n"
+ "movi v21.8h, #0x0\n"
+ "sadalp v17.4s, v18.8h\n"
+ "movi v18.8h, #0x0\n"
+ "mov x9, #0x0\n"
+ "91:" // Height 6: Multiply loop: unique 6: no collapse
+ "sadalp v1.8h, v31.16b\n"
+ "sadalp v30.8h, v28.16b\n"
+ "sadalp v27.8h, v25.16b\n"
+ "sadalp v24.8h, v22.16b\n"
+ "sadalp v21.8h, v19.16b\n"
+ "sadalp v18.8h, v16.16b\n"
+ "add x9, x9, #0x1\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x20\n"
+ "bge 90b\n"
+ "92:" // Height 6: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q31, [x26, #0x0]\n"
+ "ldr q28, [x25, #0x0]\n"
+ "ldr q25, [x24, #0x0]\n"
+ "ldr q22, [x23, #0x0]\n"
+ "ldr q19, [x22, #0x0]\n"
+ "ldr q16, [x20, #0x0]\n"
+ "sadalp v1.8h, v31.16b\n"
+ "sadalp v30.8h, v28.16b\n"
+ "sadalp v27.8h, v25.16b\n"
+ "sadalp v24.8h, v22.16b\n"
+ "sadalp v21.8h, v19.16b\n"
+ "sadalp v18.8h, v16.16b\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "add x20, x20, #0x10\n"
+ "93:" // Height 6: Multiply loop: Main loop skip
+ "cbz x27, 102f\n"
+ "tbz x27, #3, 97f\n"
+ "ldr d31, [x26], #0x8\n"
+ "ldr d28, [x25], #0x8\n"
+ "ldr d25, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "tbz x27, #2, 95f\n"
+ "ld1 { v31.s }[2], [x26], #0x4\n"
+ "ld1 { v28.s }[2], [x25], #0x4\n"
+ "ld1 { v25.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "ld1 { v19.s }[2], [x22], #0x4\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "tbz x27, #1, 94f\n"
+ "ld1 { v31.h }[6], [x26], #0x2\n"
+ "ld1 { v28.h }[6], [x25], #0x2\n"
+ "ld1 { v25.h }[6], [x24], #0x2\n"
+ "ld1 { v22.h }[6], [x23], #0x2\n"
+ "ld1 { v19.h }[6], [x22], #0x2\n"
+ "ld1 { v16.h }[6], [x20], #0x2\n"
+ "tbz x27, #0, 101f\n"
+ "ld1 { v31.b }[14], [x26]\n"
+ "ld1 { v28.b }[14], [x25]\n"
+ "ld1 { v25.b }[14], [x24]\n"
+ "ld1 { v22.b }[14], [x23]\n"
+ "ld1 { v19.b }[14], [x22]\n"
+ "ld1 { v16.b }[14], [x20]\n"
+ "b 101f\n"
+ "94:" // Height 6: Multiply loop: Ragged operand read: partial_1_12
+ "tbz x27, #0, 101f\n"
+ "ld1 { v31.b }[12], [x26]\n"
+ "ld1 { v28.b }[12], [x25]\n"
+ "ld1 { v25.b }[12], [x24]\n"
+ "ld1 { v22.b }[12], [x23]\n"
+ "ld1 { v19.b }[12], [x22]\n"
+ "ld1 { v16.b }[12], [x20]\n"
+ "b 101f\n"
+ "95:" // Height 6: Multiply loop: Ragged operand read: partial_2_8
+ "tbz x27, #1, 96f\n"
+ "ld1 { v31.h }[4], [x26], #0x2\n"
+ "ld1 { v28.h }[4], [x25], #0x2\n"
+ "ld1 { v25.h }[4], [x24], #0x2\n"
+ "ld1 { v22.h }[4], [x23], #0x2\n"
+ "ld1 { v19.h }[4], [x22], #0x2\n"
+ "ld1 { v16.h }[4], [x20], #0x2\n"
+ "tbz x27, #0, 101f\n"
+ "ld1 { v31.b }[10], [x26]\n"
+ "ld1 { v28.b }[10], [x25]\n"
+ "ld1 { v25.b }[10], [x24]\n"
+ "ld1 { v22.b }[10], [x23]\n"
+ "ld1 { v19.b }[10], [x22]\n"
+ "ld1 { v16.b }[10], [x20]\n"
+ "b 101f\n"
+ "96:" // Height 6: Multiply loop: Ragged operand read: partial_1_8
+ "tbz x27, #0, 101f\n"
+ "ld1 { v31.b }[8], [x26]\n"
+ "ld1 { v28.b }[8], [x25]\n"
+ "ld1 { v25.b }[8], [x24]\n"
+ "ld1 { v22.b }[8], [x23]\n"
+ "ld1 { v19.b }[8], [x22]\n"
+ "ld1 { v16.b }[8], [x20]\n"
+ "b 101f\n"
+ "97:" // Height 6: Multiply loop: Ragged operand read: partial_4_0
+ "tbz x27, #2, 99f\n"
+ "ldr s31, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
+ "ldr s25, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "ldr s19, [x22], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "tbz x27, #1, 98f\n"
+ "ld1 { v31.h }[2], [x26], #0x2\n"
+ "ld1 { v28.h }[2], [x25], #0x2\n"
+ "ld1 { v25.h }[2], [x24], #0x2\n"
+ "ld1 { v22.h }[2], [x23], #0x2\n"
+ "ld1 { v19.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x27, #0, 101f\n"
+ "ld1 { v31.b }[6], [x26]\n"
+ "ld1 { v28.b }[6], [x25]\n"
+ "ld1 { v25.b }[6], [x24]\n"
+ "ld1 { v22.b }[6], [x23]\n"
+ "ld1 { v19.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 101f\n"
+ "98:" // Height 6: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 101f\n"
+ "ld1 { v31.b }[4], [x26]\n"
+ "ld1 { v28.b }[4], [x25]\n"
+ "ld1 { v25.b }[4], [x24]\n"
+ "ld1 { v22.b }[4], [x23]\n"
+ "ld1 { v19.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 101f\n"
+ "99:" // Height 6: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 100f\n"
+ "ldr h31, [x26], #0x2\n"
+ "ldr h28, [x25], #0x2\n"
+ "ldr h25, [x24], #0x2\n"
+ "ldr h22, [x23], #0x2\n"
+ "ldr h19, [x22], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
+ "tbz x27, #0, 101f\n"
+ "ld1 { v31.b }[2], [x26]\n"
+ "ld1 { v28.b }[2], [x25]\n"
+ "ld1 { v25.b }[2], [x24]\n"
+ "ld1 { v22.b }[2], [x23]\n"
+ "ld1 { v19.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 101f\n"
+ "100:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b31, [x26, #0x0]\n"
+ "ldr b28, [x25, #0x0]\n"
+ "ldr b25, [x24, #0x0]\n"
+ "ldr b22, [x23, #0x0]\n"
+ "ldr b19, [x22, #0x0]\n"
+ "ldr b16, [x20, #0x0]\n"
+ "101:" // Height 6: Multiply loop: Ragged operand read: Done
+ "sadalp v1.8h, v31.16b\n"
+ "sadalp v30.8h, v28.16b\n"
+ "sadalp v27.8h, v25.16b\n"
+ "sadalp v24.8h, v22.16b\n"
+ "sadalp v21.8h, v19.16b\n"
+ "sadalp v18.8h, v16.16b\n"
+ "102:" // Height 6: Multiply loop: No odd multiplies
+ "add x28, x28, #0x1\n"
+ "cmp x28, x21\n"
+ "bne 87b\n"
+ "sadalp v0.4s, v1.8h\n"
+ "sadalp v29.4s, v30.8h\n"
+ "addp v0.4s, v0.4s, v29.4s\n"
+ "sadalp v26.4s, v27.8h\n"
+ "sadalp v23.4s, v24.8h\n"
+ "addp v29.4s, v26.4s, v23.4s\n"
+ "sadalp v20.4s, v21.8h\n"
+ "sadalp v17.4s, v18.8h\n"
+ "addp v0.4s, v0.4s, v29.4s\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "addp v20.4s, v20.4s, v17.4s\n"
+ "mul v0.4s, v0.4s, v2.4s\n"
+ "st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
+ "addp v20.4s, v20.4s, v20.4s\n"
+ "mul v20.4s, v20.4s, v2.4s\n"
+ "str d20, [%x[out_ptr]], #0x8\n"
+ "beq 104f\n"
+ "tbz %x[flags], #3, 103f\n"
+ "add %x[input_offset], %x[input_offset], #0x6\n"
+ "b 1b\n"
+ "103:" // Update direct input
+ "mov x19, #0x6\n"
+ "madd %x[input_ptr], x19, %x[input_offset], %x[input_ptr]\n"
+ "b 1b\n"
+ "104:" // Exit
+
+ : [M] "+r" (M), [input_offset] "+r" (input_offset), [input_ptr] "+r" (input_ptr), [out_ptr] "+r" (out_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [flags] "r" (flags), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [qp] "r" (qp)
+ : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp
new file mode 100644
index 0000000000..f5709d92ac
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/rowsum_indirect_u8.cpp
@@ -0,0 +1,1160 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifdef __aarch64__
+
+#include "arm_gemm.hpp"
+#include "quantized.hpp"
+#include "utils.hpp"
+
+#include <cassert>
+
+namespace arm_gemm {
+
+template<>
+void row_sums_indirect(
+ unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg<uint8_t> A_arg,
+ size_t M, int32_t *out_ptr, const Requantize32 *qp
+)
+{
+ struct KernelArgs {
+ unsigned int num_strings;
+ const unsigned int *string_lengths;
+ unsigned int input_initial_col;
+ } ka;
+
+ unsigned long flags=0;
+ void *input_ptr;
+ size_t input_offset;
+
+ if (A_arg.is_indirect) {
+ input_ptr=(void *)(A_arg.indirect.ptr);
+ input_offset=A_arg.indirect.start_row;
+ ka.input_initial_col=A_arg.indirect.start_col;
+ flags |= 0x8;
+ } else {
+ assert(num_strings==1);
+ input_ptr=(void *)(A_arg.direct.base);
+ input_offset=A_arg.direct.stride;
+ }
+
+ ka.num_strings = num_strings;
+ ka.string_lengths = string_lengths;
+
+ __asm__ __volatile__(
+ "add x19, %x[qp], %[b_offset]\n"
+ "ld1r { v2.4s }, [x19]\n"
+ "neg v2.4s, v2.4s\n"
+ "1:" // Row loop
+ "cmp %x[M], #0x6\n"
+ "bge 86f\n"
+ "cmp %x[M], #0x4\n"
+ "bgt 69f\n"
+ "beq 52f\n"
+ "cmp %x[M], #0x2\n"
+ "bgt 35f\n"
+ "beq 18f\n"
+ "movi v1.8h, #0x0\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "movi v0.4s, #0x0\n"
+ "mov x9, #0x0\n"
+ "mov x28, #0x0\n"
+ "2:" // Height 1: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x19, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 3f\n"
+ "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x19, x19, %x[input_offset], LSL #3\n"
+ "ldr x26, [x19, #0x0]\n"
+ "cbnz x28, 4f\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "b 4f\n"
+ "3:" // Height 1: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "4:" // Height 1: input setup done
+ "cmp x27, #0x10\n"
+ "blt 8f\n"
+ "cmp x27, #0x20\n"
+ "blt 7f\n"
+ "5:" // Height 1: Multiply loop: Main loop head
+ "ldr q31, [x26, #0x0]\n"
+ "cmp x9, #0x7e\n"
+ "add x26, x26, #0x10\n"
+ "blt 6f\n"
+ "uadalp v0.4s, v1.8h\n"
+ "movi v1.8h, #0x0\n"
+ "mov x9, #0x0\n"
+ "6:" // Height 1: Multiply loop: unique 1: no collapse
+ "uadalp v1.8h, v31.16b\n"
+ "add x9, x9, #0x1\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x20\n"
+ "bge 5b\n"
+ "7:" // Height 1: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q31, [x26, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "uadalp v1.8h, v31.16b\n"
+ "8:" // Height 1: Multiply loop: Main loop skip
+ "cbz x27, 17f\n"
+ "tbz x27, #3, 12f\n"
+ "ldr d31, [x26], #0x8\n"
+ "tbz x27, #2, 10f\n"
+ "ld1 { v31.s }[2], [x26], #0x4\n"
+ "tbz x27, #1, 9f\n"
+ "ld1 { v31.h }[6], [x26], #0x2\n"
+ "tbz x27, #0, 16f\n"
+ "ld1 { v31.b }[14], [x26]\n"
+ "b 16f\n"
+ "9:" // Height 1: Multiply loop: Ragged operand read: partial_1_12
+ "tbz x27, #0, 16f\n"
+ "ld1 { v31.b }[12], [x26]\n"
+ "b 16f\n"
+ "10:" // Height 1: Multiply loop: Ragged operand read: partial_2_8
+ "tbz x27, #1, 11f\n"
+ "ld1 { v31.h }[4], [x26], #0x2\n"
+ "tbz x27, #0, 16f\n"
+ "ld1 { v31.b }[10], [x26]\n"
+ "b 16f\n"
+ "11:" // Height 1: Multiply loop: Ragged operand read: partial_1_8
+ "tbz x27, #0, 16f\n"
+ "ld1 { v31.b }[8], [x26]\n"
+ "b 16f\n"
+ "12:" // Height 1: Multiply loop: Ragged operand read: partial_4_0
+ "tbz x27, #2, 14f\n"
+ "ldr s31, [x26], #0x4\n"
+ "tbz x27, #1, 13f\n"
+ "ld1 { v31.h }[2], [x26], #0x2\n"
+ "tbz x27, #0, 16f\n"
+ "ld1 { v31.b }[6], [x26]\n"
+ "b 16f\n"
+ "13:" // Height 1: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 16f\n"
+ "ld1 { v31.b }[4], [x26]\n"
+ "b 16f\n"
+ "14:" // Height 1: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 15f\n"
+ "ldr h31, [x26], #0x2\n"
+ "tbz x27, #0, 16f\n"
+ "ld1 { v31.b }[2], [x26]\n"
+ "b 16f\n"
+ "15:" // Height 1: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b31, [x26, #0x0]\n"
+ "16:" // Height 1: Multiply loop: Ragged operand read: Done
+ "uadalp v1.8h, v31.16b\n"
+ "17:" // Height 1: Multiply loop: No odd multiplies
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 2b\n"
+ "uadalp v0.4s, v1.8h\n"
+ "addp v0.4s, v0.4s, v0.4s\n"
+ "addp v0.4s, v0.4s, v0.4s\n"
+ "mul v0.4s, v0.4s, v2.4s\n"
+ "str s0, [%x[out_ptr]], #0x4\n"
+ "b 104f\n"
+ "18:" // Height 2
+ "movi v1.8h, #0x0\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "mov x9, #0x0\n"
+ "movi v0.4s, #0x0\n"
+ "mov x28, #0x0\n"
+ "movi v30.8h, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "19:" // Height 2: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x19, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 20f\n"
+ "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x19, x19, %x[input_offset], LSL #3\n"
+ "ldr x26, [x19, #0x0]\n"
+ "ldr x25, [x19, #0x8]\n"
+ "cbnz x28, 21f\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
+ "b 21f\n"
+ "20:" // Height 2: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, %x[input_offset]\n"
+ "21:" // Height 2: input setup done
+ "cmp x27, #0x10\n"
+ "blt 25f\n"
+ "cmp x27, #0x20\n"
+ "blt 24f\n"
+ "22:" // Height 2: Multiply loop: Main loop head
+ "ldr q31, [x26, #0x0]\n"
+ "ldr q28, [x25, #0x0]\n"
+ "cmp x9, #0x7e\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "blt 23f\n"
+ "uadalp v0.4s, v1.8h\n"
+ "movi v1.8h, #0x0\n"
+ "uadalp v29.4s, v30.8h\n"
+ "movi v30.8h, #0x0\n"
+ "mov x9, #0x0\n"
+ "23:" // Height 2: Multiply loop: unique 2: no collapse
+ "uadalp v1.8h, v31.16b\n"
+ "uadalp v30.8h, v28.16b\n"
+ "add x9, x9, #0x1\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x20\n"
+ "bge 22b\n"
+ "24:" // Height 2: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q31, [x26, #0x0]\n"
+ "ldr q28, [x25, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "uadalp v1.8h, v31.16b\n"
+ "uadalp v30.8h, v28.16b\n"
+ "25:" // Height 2: Multiply loop: Main loop skip
+ "cbz x27, 34f\n"
+ "tbz x27, #3, 29f\n"
+ "ldr d31, [x26], #0x8\n"
+ "ldr d28, [x25], #0x8\n"
+ "tbz x27, #2, 27f\n"
+ "ld1 { v31.s }[2], [x26], #0x4\n"
+ "ld1 { v28.s }[2], [x25], #0x4\n"
+ "tbz x27, #1, 26f\n"
+ "ld1 { v31.h }[6], [x26], #0x2\n"
+ "ld1 { v28.h }[6], [x25], #0x2\n"
+ "tbz x27, #0, 33f\n"
+ "ld1 { v31.b }[14], [x26]\n"
+ "ld1 { v28.b }[14], [x25]\n"
+ "b 33f\n"
+ "26:" // Height 2: Multiply loop: Ragged operand read: partial_1_12
+ "tbz x27, #0, 33f\n"
+ "ld1 { v31.b }[12], [x26]\n"
+ "ld1 { v28.b }[12], [x25]\n"
+ "b 33f\n"
+ "27:" // Height 2: Multiply loop: Ragged operand read: partial_2_8
+ "tbz x27, #1, 28f\n"
+ "ld1 { v31.h }[4], [x26], #0x2\n"
+ "ld1 { v28.h }[4], [x25], #0x2\n"
+ "tbz x27, #0, 33f\n"
+ "ld1 { v31.b }[10], [x26]\n"
+ "ld1 { v28.b }[10], [x25]\n"
+ "b 33f\n"
+ "28:" // Height 2: Multiply loop: Ragged operand read: partial_1_8
+ "tbz x27, #0, 33f\n"
+ "ld1 { v31.b }[8], [x26]\n"
+ "ld1 { v28.b }[8], [x25]\n"
+ "b 33f\n"
+ "29:" // Height 2: Multiply loop: Ragged operand read: partial_4_0
+ "tbz x27, #2, 31f\n"
+ "ldr s31, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
+ "tbz x27, #1, 30f\n"
+ "ld1 { v31.h }[2], [x26], #0x2\n"
+ "ld1 { v28.h }[2], [x25], #0x2\n"
+ "tbz x27, #0, 33f\n"
+ "ld1 { v31.b }[6], [x26]\n"
+ "ld1 { v28.b }[6], [x25]\n"
+ "b 33f\n"
+ "30:" // Height 2: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 33f\n"
+ "ld1 { v31.b }[4], [x26]\n"
+ "ld1 { v28.b }[4], [x25]\n"
+ "b 33f\n"
+ "31:" // Height 2: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 32f\n"
+ "ldr h31, [x26], #0x2\n"
+ "ldr h28, [x25], #0x2\n"
+ "tbz x27, #0, 33f\n"
+ "ld1 { v31.b }[2], [x26]\n"
+ "ld1 { v28.b }[2], [x25]\n"
+ "b 33f\n"
+ "32:" // Height 2: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b31, [x26, #0x0]\n"
+ "ldr b28, [x25, #0x0]\n"
+ "33:" // Height 2: Multiply loop: Ragged operand read: Done
+ "uadalp v1.8h, v31.16b\n"
+ "uadalp v30.8h, v28.16b\n"
+ "34:" // Height 2: Multiply loop: No odd multiplies
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 19b\n"
+ "uadalp v0.4s, v1.8h\n"
+ "uadalp v29.4s, v30.8h\n"
+ "addp v0.4s, v0.4s, v29.4s\n"
+ "addp v0.4s, v0.4s, v0.4s\n"
+ "mul v0.4s, v0.4s, v2.4s\n"
+ "str d0, [%x[out_ptr]], #0x8\n"
+ "b 104f\n"
+ "35:" // Height 3
+ "movi v1.8h, #0x0\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "mov x9, #0x0\n"
+ "movi v0.4s, #0x0\n"
+ "mov x28, #0x0\n"
+ "movi v30.8h, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v27.8h, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "36:" // Height 3: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x19, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 37f\n"
+ "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x19, x19, %x[input_offset], LSL #3\n"
+ "ldr x26, [x19, #0x0]\n"
+ "ldr x25, [x19, #0x8]\n"
+ "ldr x24, [x19, #0x10]\n"
+ "cbnz x28, 38f\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "b 38f\n"
+ "37:" // Height 3: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, %x[input_offset]\n"
+ "add x24, x25, %x[input_offset]\n"
+ "38:" // Height 3: input setup done
+ "cmp x27, #0x10\n"
+ "blt 42f\n"
+ "cmp x27, #0x20\n"
+ "blt 41f\n"
+ "39:" // Height 3: Multiply loop: Main loop head
+ "ldr q31, [x26, #0x0]\n"
+ "ldr q28, [x25, #0x0]\n"
+ "ldr q25, [x24, #0x0]\n"
+ "cmp x9, #0x7e\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "blt 40f\n"
+ "uadalp v0.4s, v1.8h\n"
+ "movi v1.8h, #0x0\n"
+ "uadalp v29.4s, v30.8h\n"
+ "movi v30.8h, #0x0\n"
+ "uadalp v26.4s, v27.8h\n"
+ "movi v27.8h, #0x0\n"
+ "mov x9, #0x0\n"
+ "40:" // Height 3: Multiply loop: unique 3: no collapse
+ "uadalp v1.8h, v31.16b\n"
+ "uadalp v30.8h, v28.16b\n"
+ "uadalp v27.8h, v25.16b\n"
+ "add x9, x9, #0x1\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x20\n"
+ "bge 39b\n"
+ "41:" // Height 3: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q31, [x26, #0x0]\n"
+ "ldr q28, [x25, #0x0]\n"
+ "ldr q25, [x24, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "uadalp v1.8h, v31.16b\n"
+ "uadalp v30.8h, v28.16b\n"
+ "uadalp v27.8h, v25.16b\n"
+ "add x24, x24, #0x10\n"
+ "42:" // Height 3: Multiply loop: Main loop skip
+ "cbz x27, 51f\n"
+ "tbz x27, #3, 46f\n"
+ "ldr d31, [x26], #0x8\n"
+ "ldr d28, [x25], #0x8\n"
+ "ldr d25, [x24], #0x8\n"
+ "tbz x27, #2, 44f\n"
+ "ld1 { v31.s }[2], [x26], #0x4\n"
+ "ld1 { v28.s }[2], [x25], #0x4\n"
+ "ld1 { v25.s }[2], [x24], #0x4\n"
+ "tbz x27, #1, 43f\n"
+ "ld1 { v31.h }[6], [x26], #0x2\n"
+ "ld1 { v28.h }[6], [x25], #0x2\n"
+ "ld1 { v25.h }[6], [x24], #0x2\n"
+ "tbz x27, #0, 50f\n"
+ "ld1 { v31.b }[14], [x26]\n"
+ "ld1 { v28.b }[14], [x25]\n"
+ "ld1 { v25.b }[14], [x24]\n"
+ "b 50f\n"
+ "43:" // Height 3: Multiply loop: Ragged operand read: partial_1_12
+ "tbz x27, #0, 50f\n"
+ "ld1 { v31.b }[12], [x26]\n"
+ "ld1 { v28.b }[12], [x25]\n"
+ "ld1 { v25.b }[12], [x24]\n"
+ "b 50f\n"
+ "44:" // Height 3: Multiply loop: Ragged operand read: partial_2_8
+ "tbz x27, #1, 45f\n"
+ "ld1 { v31.h }[4], [x26], #0x2\n"
+ "ld1 { v28.h }[4], [x25], #0x2\n"
+ "ld1 { v25.h }[4], [x24], #0x2\n"
+ "tbz x27, #0, 50f\n"
+ "ld1 { v31.b }[10], [x26]\n"
+ "ld1 { v28.b }[10], [x25]\n"
+ "ld1 { v25.b }[10], [x24]\n"
+ "b 50f\n"
+ "45:" // Height 3: Multiply loop: Ragged operand read: partial_1_8
+ "tbz x27, #0, 50f\n"
+ "ld1 { v31.b }[8], [x26]\n"
+ "ld1 { v28.b }[8], [x25]\n"
+ "ld1 { v25.b }[8], [x24]\n"
+ "b 50f\n"
+ "46:" // Height 3: Multiply loop: Ragged operand read: partial_4_0
+ "tbz x27, #2, 48f\n"
+ "ldr s31, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
+ "ldr s25, [x24], #0x4\n"
+ "tbz x27, #1, 47f\n"
+ "ld1 { v31.h }[2], [x26], #0x2\n"
+ "ld1 { v28.h }[2], [x25], #0x2\n"
+ "ld1 { v25.h }[2], [x24], #0x2\n"
+ "tbz x27, #0, 50f\n"
+ "ld1 { v31.b }[6], [x26]\n"
+ "ld1 { v28.b }[6], [x25]\n"
+ "ld1 { v25.b }[6], [x24]\n"
+ "b 50f\n"
+ "47:" // Height 3: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 50f\n"
+ "ld1 { v31.b }[4], [x26]\n"
+ "ld1 { v28.b }[4], [x25]\n"
+ "ld1 { v25.b }[4], [x24]\n"
+ "b 50f\n"
+ "48:" // Height 3: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 49f\n"
+ "ldr h31, [x26], #0x2\n"
+ "ldr h28, [x25], #0x2\n"
+ "ldr h25, [x24], #0x2\n"
+ "tbz x27, #0, 50f\n"
+ "ld1 { v31.b }[2], [x26]\n"
+ "ld1 { v28.b }[2], [x25]\n"
+ "ld1 { v25.b }[2], [x24]\n"
+ "b 50f\n"
+ "49:" // Height 3: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b31, [x26, #0x0]\n"
+ "ldr b28, [x25, #0x0]\n"
+ "ldr b25, [x24, #0x0]\n"
+ "50:" // Height 3: Multiply loop: Ragged operand read: Done
+ "uadalp v1.8h, v31.16b\n"
+ "uadalp v30.8h, v28.16b\n"
+ "uadalp v27.8h, v25.16b\n"
+ "51:" // Height 3: Multiply loop: No odd multiplies
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 36b\n"
+ "uadalp v0.4s, v1.8h\n"
+ "uadalp v29.4s, v30.8h\n"
+ "addp v0.4s, v0.4s, v29.4s\n"
+ "uadalp v26.4s, v27.8h\n"
+ "addp v0.4s, v0.4s, v0.4s\n"
+ "addp v26.4s, v26.4s, v26.4s\n"
+ "mul v0.4s, v0.4s, v2.4s\n"
+ "str d0, [%x[out_ptr]], #0x8\n"
+ "addp v26.4s, v26.4s, v26.4s\n"
+ "mul v26.4s, v26.4s, v2.4s\n"
+ "str s26, [%x[out_ptr]], #0x4\n"
+ "b 104f\n"
+ "52:" // Height 4
+ "movi v1.8h, #0x0\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "mov x9, #0x0\n"
+ "movi v0.4s, #0x0\n"
+ "mov x28, #0x0\n"
+ "movi v30.8h, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v27.8h, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v24.8h, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "53:" // Height 4: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x19, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 54f\n"
+ "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x19, x19, %x[input_offset], LSL #3\n"
+ "ldr x26, [x19, #0x0]\n"
+ "ldr x25, [x19, #0x8]\n"
+ "ldr x24, [x19, #0x10]\n"
+ "ldr x23, [x19, #0x18]\n"
+ "cbnz x28, 55f\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "b 55f\n"
+ "54:" // Height 4: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, %x[input_offset]\n"
+ "add x24, x25, %x[input_offset]\n"
+ "add x23, x24, %x[input_offset]\n"
+ "55:" // Height 4: input setup done
+ "cmp x27, #0x10\n"
+ "blt 59f\n"
+ "cmp x27, #0x20\n"
+ "blt 58f\n"
+ "56:" // Height 4: Multiply loop: Main loop head
+ "ldr q31, [x26, #0x0]\n"
+ "ldr q28, [x25, #0x0]\n"
+ "ldr q25, [x24, #0x0]\n"
+ "ldr q22, [x23, #0x0]\n"
+ "cmp x9, #0x7e\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "blt 57f\n"
+ "uadalp v0.4s, v1.8h\n"
+ "movi v1.8h, #0x0\n"
+ "uadalp v29.4s, v30.8h\n"
+ "movi v30.8h, #0x0\n"
+ "uadalp v26.4s, v27.8h\n"
+ "movi v27.8h, #0x0\n"
+ "uadalp v23.4s, v24.8h\n"
+ "movi v24.8h, #0x0\n"
+ "mov x9, #0x0\n"
+ "57:" // Height 4: Multiply loop: unique 4: no collapse
+ "uadalp v1.8h, v31.16b\n"
+ "uadalp v30.8h, v28.16b\n"
+ "uadalp v27.8h, v25.16b\n"
+ "uadalp v24.8h, v22.16b\n"
+ "add x9, x9, #0x1\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x20\n"
+ "bge 56b\n"
+ "58:" // Height 4: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q31, [x26, #0x0]\n"
+ "ldr q28, [x25, #0x0]\n"
+ "ldr q25, [x24, #0x0]\n"
+ "ldr q22, [x23, #0x0]\n"
+ "add x26, x26, #0x10\n"
+ "uadalp v1.8h, v31.16b\n"
+ "uadalp v30.8h, v28.16b\n"
+ "uadalp v27.8h, v25.16b\n"
+ "uadalp v24.8h, v22.16b\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "59:" // Height 4: Multiply loop: Main loop skip
+ "cbz x27, 68f\n"
+ "tbz x27, #3, 63f\n"
+ "ldr d31, [x26], #0x8\n"
+ "ldr d28, [x25], #0x8\n"
+ "ldr d25, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "tbz x27, #2, 61f\n"
+ "ld1 { v31.s }[2], [x26], #0x4\n"
+ "ld1 { v28.s }[2], [x25], #0x4\n"
+ "ld1 { v25.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "tbz x27, #1, 60f\n"
+ "ld1 { v31.h }[6], [x26], #0x2\n"
+ "ld1 { v28.h }[6], [x25], #0x2\n"
+ "ld1 { v25.h }[6], [x24], #0x2\n"
+ "ld1 { v22.h }[6], [x23], #0x2\n"
+ "tbz x27, #0, 67f\n"
+ "ld1 { v31.b }[14], [x26]\n"
+ "ld1 { v28.b }[14], [x25]\n"
+ "ld1 { v25.b }[14], [x24]\n"
+ "ld1 { v22.b }[14], [x23]\n"
+ "b 67f\n"
+ "60:" // Height 4: Multiply loop: Ragged operand read: partial_1_12
+ "tbz x27, #0, 67f\n"
+ "ld1 { v31.b }[12], [x26]\n"
+ "ld1 { v28.b }[12], [x25]\n"
+ "ld1 { v25.b }[12], [x24]\n"
+ "ld1 { v22.b }[12], [x23]\n"
+ "b 67f\n"
+ "61:" // Height 4: Multiply loop: Ragged operand read: partial_2_8
+ "tbz x27, #1, 62f\n"
+ "ld1 { v31.h }[4], [x26], #0x2\n"
+ "ld1 { v28.h }[4], [x25], #0x2\n"
+ "ld1 { v25.h }[4], [x24], #0x2\n"
+ "ld1 { v22.h }[4], [x23], #0x2\n"
+ "tbz x27, #0, 67f\n"
+ "ld1 { v31.b }[10], [x26]\n"
+ "ld1 { v28.b }[10], [x25]\n"
+ "ld1 { v25.b }[10], [x24]\n"
+ "ld1 { v22.b }[10], [x23]\n"
+ "b 67f\n"
+ "62:" // Height 4: Multiply loop: Ragged operand read: partial_1_8
+ "tbz x27, #0, 67f\n"
+ "ld1 { v31.b }[8], [x26]\n"
+ "ld1 { v28.b }[8], [x25]\n"
+ "ld1 { v25.b }[8], [x24]\n"
+ "ld1 { v22.b }[8], [x23]\n"
+ "b 67f\n"
+ "63:" // Height 4: Multiply loop: Ragged operand read: partial_4_0
+ "tbz x27, #2, 65f\n"
+ "ldr s31, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
+ "ldr s25, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "tbz x27, #1, 64f\n"
+ "ld1 { v31.h }[2], [x26], #0x2\n"
+ "ld1 { v28.h }[2], [x25], #0x2\n"
+ "ld1 { v25.h }[2], [x24], #0x2\n"
+ "ld1 { v22.h }[2], [x23], #0x2\n"
+ "tbz x27, #0, 67f\n"
+ "ld1 { v31.b }[6], [x26]\n"
+ "ld1 { v28.b }[6], [x25]\n"
+ "ld1 { v25.b }[6], [x24]\n"
+ "ld1 { v22.b }[6], [x23]\n"
+ "b 67f\n"
+ "64:" // Height 4: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 67f\n"
+ "ld1 { v31.b }[4], [x26]\n"
+ "ld1 { v28.b }[4], [x25]\n"
+ "ld1 { v25.b }[4], [x24]\n"
+ "ld1 { v22.b }[4], [x23]\n"
+ "b 67f\n"
+ "65:" // Height 4: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 66f\n"
+ "ldr h31, [x26], #0x2\n"
+ "ldr h28, [x25], #0x2\n"
+ "ldr h25, [x24], #0x2\n"
+ "ldr h22, [x23], #0x2\n"
+ "tbz x27, #0, 67f\n"
+ "ld1 { v31.b }[2], [x26]\n"
+ "ld1 { v28.b }[2], [x25]\n"
+ "ld1 { v25.b }[2], [x24]\n"
+ "ld1 { v22.b }[2], [x23]\n"
+ "b 67f\n"
+ "66:" // Height 4: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b31, [x26, #0x0]\n"
+ "ldr b28, [x25, #0x0]\n"
+ "ldr b25, [x24, #0x0]\n"
+ "ldr b22, [x23, #0x0]\n"
+ "67:" // Height 4: Multiply loop: Ragged operand read: Done
+ "uadalp v1.8h, v31.16b\n"
+ "uadalp v30.8h, v28.16b\n"
+ "uadalp v27.8h, v25.16b\n"
+ "uadalp v24.8h, v22.16b\n"
+ "68:" // Height 4: Multiply loop: No odd multiplies
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 53b\n"
+ "uadalp v0.4s, v1.8h\n"
+ "uadalp v29.4s, v30.8h\n"
+ "addp v0.4s, v0.4s, v29.4s\n"
+ "uadalp v26.4s, v27.8h\n"
+ "uadalp v23.4s, v24.8h\n"
+ "addp v29.4s, v26.4s, v23.4s\n"
+ "addp v0.4s, v0.4s, v29.4s\n"
+ "mul v0.4s, v0.4s, v2.4s\n"
+ "st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
+ "b 104f\n"
+ "69:" // Height 5
+ "movi v1.8h, #0x0\n"
+ "ldr w20, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "mov x9, #0x0\n"
+ "movi v0.4s, #0x0\n"
+ "mov x28, #0x0\n"
+ "movi v30.8h, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v27.8h, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v24.8h, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v21.8h, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "70:" // Height 5: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x19, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 71f\n"
+ "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x19, x19, %x[input_offset], LSL #3\n"
+ "ldr x26, [x19, #0x0]\n"
+ "ldr x25, [x19, #0x8]\n"
+ "ldr x24, [x19, #0x10]\n"
+ "ldr x23, [x19, #0x18]\n"
+ "ldr x22, [x19, #0x20]\n"
+ "cbnz x28, 72f\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "b 72f\n"
+ "71:" // Height 5: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, %x[input_offset]\n"
+ "add x24, x25, %x[input_offset]\n"
+ "add x23, x24, %x[input_offset]\n"
+ "add x22, x23, %x[input_offset]\n"
+ "72:" // Height 5: input setup done
+ "cmp x27, #0x10\n"
+ "blt 76f\n"
+ "cmp x27, #0x20\n"
+ "blt 75f\n"
+ "73:" // Height 5: Multiply loop: Main loop head
+ "ldr q31, [x26, #0x0]\n"
+ "ldr q28, [x25, #0x0]\n"
+ "ldr q25, [x24, #0x0]\n"
+ "ldr q22, [x23, #0x0]\n"
+ "ldr q19, [x22, #0x0]\n"
+ "cmp x9, #0x7e\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "blt 74f\n"
+ "uadalp v0.4s, v1.8h\n"
+ "movi v1.8h, #0x0\n"
+ "uadalp v29.4s, v30.8h\n"
+ "movi v30.8h, #0x0\n"
+ "uadalp v26.4s, v27.8h\n"
+ "movi v27.8h, #0x0\n"
+ "uadalp v23.4s, v24.8h\n"
+ "movi v24.8h, #0x0\n"
+ "uadalp v20.4s, v21.8h\n"
+ "movi v21.8h, #0x0\n"
+ "mov x9, #0x0\n"
+ "74:" // Height 5: Multiply loop: unique 5: no collapse
+ "uadalp v1.8h, v31.16b\n"
+ "uadalp v30.8h, v28.16b\n"
+ "uadalp v27.8h, v25.16b\n"
+ "uadalp v24.8h, v22.16b\n"
+ "uadalp v21.8h, v19.16b\n"
+ "add x9, x9, #0x1\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x20\n"
+ "bge 73b\n"
+ "75:" // Height 5: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q31, [x26, #0x0]\n"
+ "ldr q28, [x25, #0x0]\n"
+ "ldr q25, [x24, #0x0]\n"
+ "ldr q22, [x23, #0x0]\n"
+ "ldr q19, [x22, #0x0]\n"
+ "uadalp v1.8h, v31.16b\n"
+ "uadalp v30.8h, v28.16b\n"
+ "uadalp v27.8h, v25.16b\n"
+ "uadalp v24.8h, v22.16b\n"
+ "uadalp v21.8h, v19.16b\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "76:" // Height 5: Multiply loop: Main loop skip
+ "cbz x27, 85f\n"
+ "tbz x27, #3, 80f\n"
+ "ldr d31, [x26], #0x8\n"
+ "ldr d28, [x25], #0x8\n"
+ "ldr d25, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "tbz x27, #2, 78f\n"
+ "ld1 { v31.s }[2], [x26], #0x4\n"
+ "ld1 { v28.s }[2], [x25], #0x4\n"
+ "ld1 { v25.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "ld1 { v19.s }[2], [x22], #0x4\n"
+ "tbz x27, #1, 77f\n"
+ "ld1 { v31.h }[6], [x26], #0x2\n"
+ "ld1 { v28.h }[6], [x25], #0x2\n"
+ "ld1 { v25.h }[6], [x24], #0x2\n"
+ "ld1 { v22.h }[6], [x23], #0x2\n"
+ "ld1 { v19.h }[6], [x22], #0x2\n"
+ "tbz x27, #0, 84f\n"
+ "ld1 { v31.b }[14], [x26]\n"
+ "ld1 { v28.b }[14], [x25]\n"
+ "ld1 { v25.b }[14], [x24]\n"
+ "ld1 { v22.b }[14], [x23]\n"
+ "ld1 { v19.b }[14], [x22]\n"
+ "b 84f\n"
+ "77:" // Height 5: Multiply loop: Ragged operand read: partial_1_12
+ "tbz x27, #0, 84f\n"
+ "ld1 { v31.b }[12], [x26]\n"
+ "ld1 { v28.b }[12], [x25]\n"
+ "ld1 { v25.b }[12], [x24]\n"
+ "ld1 { v22.b }[12], [x23]\n"
+ "ld1 { v19.b }[12], [x22]\n"
+ "b 84f\n"
+ "78:" // Height 5: Multiply loop: Ragged operand read: partial_2_8
+ "tbz x27, #1, 79f\n"
+ "ld1 { v31.h }[4], [x26], #0x2\n"
+ "ld1 { v28.h }[4], [x25], #0x2\n"
+ "ld1 { v25.h }[4], [x24], #0x2\n"
+ "ld1 { v22.h }[4], [x23], #0x2\n"
+ "ld1 { v19.h }[4], [x22], #0x2\n"
+ "tbz x27, #0, 84f\n"
+ "ld1 { v31.b }[10], [x26]\n"
+ "ld1 { v28.b }[10], [x25]\n"
+ "ld1 { v25.b }[10], [x24]\n"
+ "ld1 { v22.b }[10], [x23]\n"
+ "ld1 { v19.b }[10], [x22]\n"
+ "b 84f\n"
+ "79:" // Height 5: Multiply loop: Ragged operand read: partial_1_8
+ "tbz x27, #0, 84f\n"
+ "ld1 { v31.b }[8], [x26]\n"
+ "ld1 { v28.b }[8], [x25]\n"
+ "ld1 { v25.b }[8], [x24]\n"
+ "ld1 { v22.b }[8], [x23]\n"
+ "ld1 { v19.b }[8], [x22]\n"
+ "b 84f\n"
+ "80:" // Height 5: Multiply loop: Ragged operand read: partial_4_0
+ "tbz x27, #2, 82f\n"
+ "ldr s31, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
+ "ldr s25, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "ldr s19, [x22], #0x4\n"
+ "tbz x27, #1, 81f\n"
+ "ld1 { v31.h }[2], [x26], #0x2\n"
+ "ld1 { v28.h }[2], [x25], #0x2\n"
+ "ld1 { v25.h }[2], [x24], #0x2\n"
+ "ld1 { v22.h }[2], [x23], #0x2\n"
+ "ld1 { v19.h }[2], [x22], #0x2\n"
+ "tbz x27, #0, 84f\n"
+ "ld1 { v31.b }[6], [x26]\n"
+ "ld1 { v28.b }[6], [x25]\n"
+ "ld1 { v25.b }[6], [x24]\n"
+ "ld1 { v22.b }[6], [x23]\n"
+ "ld1 { v19.b }[6], [x22]\n"
+ "b 84f\n"
+ "81:" // Height 5: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 84f\n"
+ "ld1 { v31.b }[4], [x26]\n"
+ "ld1 { v28.b }[4], [x25]\n"
+ "ld1 { v25.b }[4], [x24]\n"
+ "ld1 { v22.b }[4], [x23]\n"
+ "ld1 { v19.b }[4], [x22]\n"
+ "b 84f\n"
+ "82:" // Height 5: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 83f\n"
+ "ldr h31, [x26], #0x2\n"
+ "ldr h28, [x25], #0x2\n"
+ "ldr h25, [x24], #0x2\n"
+ "ldr h22, [x23], #0x2\n"
+ "ldr h19, [x22], #0x2\n"
+ "tbz x27, #0, 84f\n"
+ "ld1 { v31.b }[2], [x26]\n"
+ "ld1 { v28.b }[2], [x25]\n"
+ "ld1 { v25.b }[2], [x24]\n"
+ "ld1 { v22.b }[2], [x23]\n"
+ "ld1 { v19.b }[2], [x22]\n"
+ "b 84f\n"
+ "83:" // Height 5: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b31, [x26, #0x0]\n"
+ "ldr b28, [x25, #0x0]\n"
+ "ldr b25, [x24, #0x0]\n"
+ "ldr b22, [x23, #0x0]\n"
+ "ldr b19, [x22, #0x0]\n"
+ "84:" // Height 5: Multiply loop: Ragged operand read: Done
+ "uadalp v1.8h, v31.16b\n"
+ "uadalp v30.8h, v28.16b\n"
+ "uadalp v27.8h, v25.16b\n"
+ "uadalp v24.8h, v22.16b\n"
+ "uadalp v21.8h, v19.16b\n"
+ "85:" // Height 5: Multiply loop: No odd multiplies
+ "add x28, x28, #0x1\n"
+ "cmp x28, x20\n"
+ "bne 70b\n"
+ "uadalp v0.4s, v1.8h\n"
+ "uadalp v29.4s, v30.8h\n"
+ "addp v0.4s, v0.4s, v29.4s\n"
+ "uadalp v26.4s, v27.8h\n"
+ "uadalp v23.4s, v24.8h\n"
+ "addp v29.4s, v26.4s, v23.4s\n"
+ "uadalp v20.4s, v21.8h\n"
+ "addp v0.4s, v0.4s, v29.4s\n"
+ "addp v20.4s, v20.4s, v20.4s\n"
+ "mul v0.4s, v0.4s, v2.4s\n"
+ "st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
+ "addp v20.4s, v20.4s, v20.4s\n"
+ "mul v20.4s, v20.4s, v2.4s\n"
+ "str s20, [%x[out_ptr]], #0x4\n"
+ "b 104f\n"
+ "86:" // Height 6
+ "movi v1.8h, #0x0\n"
+ "ldr w21, [%x[args_ptr], %[offsetof_num_strings]]\n"
+ "mov x9, #0x0\n"
+ "movi v0.4s, #0x0\n"
+ "mov x28, #0x0\n"
+ "movi v30.8h, #0x0\n"
+ "movi v29.4s, #0x0\n"
+ "movi v27.8h, #0x0\n"
+ "movi v26.4s, #0x0\n"
+ "movi v24.8h, #0x0\n"
+ "movi v23.4s, #0x0\n"
+ "movi v21.8h, #0x0\n"
+ "movi v20.4s, #0x0\n"
+ "movi v18.8h, #0x0\n"
+ "movi v17.4s, #0x0\n"
+ "87:" // Height 6: String loop
+ "ldr x19, [%x[args_ptr], %[offsetof_string_lengths]]\n"
+ "ldr w27, [x19, x28, LSL #0x2]\n"
+ "tbz %x[flags], #3, 88f\n"
+ "ldr x19, [%x[input_ptr], x28, LSL #0x3]\n"
+ "add x19, x19, %x[input_offset], LSL #3\n"
+ "ldr x26, [x19, #0x0]\n"
+ "ldr x25, [x19, #0x8]\n"
+ "ldr x24, [x19, #0x10]\n"
+ "ldr x23, [x19, #0x18]\n"
+ "ldr x22, [x19, #0x20]\n"
+ "ldr x20, [x19, #0x28]\n"
+ "cbnz x28, 89f\n"
+ "ldr w19, [%x[args_ptr], %[offsetof_input_initial_col]]\n"
+ "add x26, x26, x19\n"
+ "add x25, x25, x19\n"
+ "add x24, x24, x19\n"
+ "add x23, x23, x19\n"
+ "add x22, x22, x19\n"
+ "add x20, x20, x19\n"
+ "b 89f\n"
+ "88:" // Height 6: setup direct input
+ "mov x26, %x[input_ptr]\n"
+ "add x25, x26, %x[input_offset]\n"
+ "add x24, x25, %x[input_offset]\n"
+ "add x23, x24, %x[input_offset]\n"
+ "add x22, x23, %x[input_offset]\n"
+ "add x20, x22, %x[input_offset]\n"
+ "89:" // Height 6: input setup done
+ "cmp x27, #0x10\n"
+ "blt 93f\n"
+ "cmp x27, #0x20\n"
+ "blt 92f\n"
+ "90:" // Height 6: Multiply loop: Main loop head
+ "ldr q31, [x26, #0x0]\n"
+ "ldr q28, [x25, #0x0]\n"
+ "ldr q25, [x24, #0x0]\n"
+ "ldr q22, [x23, #0x0]\n"
+ "ldr q19, [x22, #0x0]\n"
+ "ldr q16, [x20, #0x0]\n"
+ "cmp x9, #0x7e\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "add x20, x20, #0x10\n"
+ "blt 91f\n"
+ "uadalp v0.4s, v1.8h\n"
+ "movi v1.8h, #0x0\n"
+ "uadalp v29.4s, v30.8h\n"
+ "movi v30.8h, #0x0\n"
+ "uadalp v26.4s, v27.8h\n"
+ "movi v27.8h, #0x0\n"
+ "uadalp v23.4s, v24.8h\n"
+ "movi v24.8h, #0x0\n"
+ "uadalp v20.4s, v21.8h\n"
+ "movi v21.8h, #0x0\n"
+ "uadalp v17.4s, v18.8h\n"
+ "movi v18.8h, #0x0\n"
+ "mov x9, #0x0\n"
+ "91:" // Height 6: Multiply loop: unique 6: no collapse
+ "uadalp v1.8h, v31.16b\n"
+ "uadalp v30.8h, v28.16b\n"
+ "uadalp v27.8h, v25.16b\n"
+ "uadalp v24.8h, v22.16b\n"
+ "uadalp v21.8h, v19.16b\n"
+ "uadalp v18.8h, v16.16b\n"
+ "add x9, x9, #0x1\n"
+ "sub x27, x27, #0x10\n"
+ "cmp x27, #0x20\n"
+ "bge 90b\n"
+ "92:" // Height 6: Multiply loop: Single iteration only
+ "sub x27, x27, #0x10\n"
+ "ldr q31, [x26, #0x0]\n"
+ "ldr q28, [x25, #0x0]\n"
+ "ldr q25, [x24, #0x0]\n"
+ "ldr q22, [x23, #0x0]\n"
+ "ldr q19, [x22, #0x0]\n"
+ "ldr q16, [x20, #0x0]\n"
+ "uadalp v1.8h, v31.16b\n"
+ "uadalp v30.8h, v28.16b\n"
+ "uadalp v27.8h, v25.16b\n"
+ "uadalp v24.8h, v22.16b\n"
+ "uadalp v21.8h, v19.16b\n"
+ "uadalp v18.8h, v16.16b\n"
+ "add x26, x26, #0x10\n"
+ "add x25, x25, #0x10\n"
+ "add x24, x24, #0x10\n"
+ "add x23, x23, #0x10\n"
+ "add x22, x22, #0x10\n"
+ "add x20, x20, #0x10\n"
+ "93:" // Height 6: Multiply loop: Main loop skip
+ "cbz x27, 102f\n"
+ "tbz x27, #3, 97f\n"
+ "ldr d31, [x26], #0x8\n"
+ "ldr d28, [x25], #0x8\n"
+ "ldr d25, [x24], #0x8\n"
+ "ldr d22, [x23], #0x8\n"
+ "ldr d19, [x22], #0x8\n"
+ "ldr d16, [x20], #0x8\n"
+ "tbz x27, #2, 95f\n"
+ "ld1 { v31.s }[2], [x26], #0x4\n"
+ "ld1 { v28.s }[2], [x25], #0x4\n"
+ "ld1 { v25.s }[2], [x24], #0x4\n"
+ "ld1 { v22.s }[2], [x23], #0x4\n"
+ "ld1 { v19.s }[2], [x22], #0x4\n"
+ "ld1 { v16.s }[2], [x20], #0x4\n"
+ "tbz x27, #1, 94f\n"
+ "ld1 { v31.h }[6], [x26], #0x2\n"
+ "ld1 { v28.h }[6], [x25], #0x2\n"
+ "ld1 { v25.h }[6], [x24], #0x2\n"
+ "ld1 { v22.h }[6], [x23], #0x2\n"
+ "ld1 { v19.h }[6], [x22], #0x2\n"
+ "ld1 { v16.h }[6], [x20], #0x2\n"
+ "tbz x27, #0, 101f\n"
+ "ld1 { v31.b }[14], [x26]\n"
+ "ld1 { v28.b }[14], [x25]\n"
+ "ld1 { v25.b }[14], [x24]\n"
+ "ld1 { v22.b }[14], [x23]\n"
+ "ld1 { v19.b }[14], [x22]\n"
+ "ld1 { v16.b }[14], [x20]\n"
+ "b 101f\n"
+ "94:" // Height 6: Multiply loop: Ragged operand read: partial_1_12
+ "tbz x27, #0, 101f\n"
+ "ld1 { v31.b }[12], [x26]\n"
+ "ld1 { v28.b }[12], [x25]\n"
+ "ld1 { v25.b }[12], [x24]\n"
+ "ld1 { v22.b }[12], [x23]\n"
+ "ld1 { v19.b }[12], [x22]\n"
+ "ld1 { v16.b }[12], [x20]\n"
+ "b 101f\n"
+ "95:" // Height 6: Multiply loop: Ragged operand read: partial_2_8
+ "tbz x27, #1, 96f\n"
+ "ld1 { v31.h }[4], [x26], #0x2\n"
+ "ld1 { v28.h }[4], [x25], #0x2\n"
+ "ld1 { v25.h }[4], [x24], #0x2\n"
+ "ld1 { v22.h }[4], [x23], #0x2\n"
+ "ld1 { v19.h }[4], [x22], #0x2\n"
+ "ld1 { v16.h }[4], [x20], #0x2\n"
+ "tbz x27, #0, 101f\n"
+ "ld1 { v31.b }[10], [x26]\n"
+ "ld1 { v28.b }[10], [x25]\n"
+ "ld1 { v25.b }[10], [x24]\n"
+ "ld1 { v22.b }[10], [x23]\n"
+ "ld1 { v19.b }[10], [x22]\n"
+ "ld1 { v16.b }[10], [x20]\n"
+ "b 101f\n"
+ "96:" // Height 6: Multiply loop: Ragged operand read: partial_1_8
+ "tbz x27, #0, 101f\n"
+ "ld1 { v31.b }[8], [x26]\n"
+ "ld1 { v28.b }[8], [x25]\n"
+ "ld1 { v25.b }[8], [x24]\n"
+ "ld1 { v22.b }[8], [x23]\n"
+ "ld1 { v19.b }[8], [x22]\n"
+ "ld1 { v16.b }[8], [x20]\n"
+ "b 101f\n"
+ "97:" // Height 6: Multiply loop: Ragged operand read: partial_4_0
+ "tbz x27, #2, 99f\n"
+ "ldr s31, [x26], #0x4\n"
+ "ldr s28, [x25], #0x4\n"
+ "ldr s25, [x24], #0x4\n"
+ "ldr s22, [x23], #0x4\n"
+ "ldr s19, [x22], #0x4\n"
+ "ldr s16, [x20], #0x4\n"
+ "tbz x27, #1, 98f\n"
+ "ld1 { v31.h }[2], [x26], #0x2\n"
+ "ld1 { v28.h }[2], [x25], #0x2\n"
+ "ld1 { v25.h }[2], [x24], #0x2\n"
+ "ld1 { v22.h }[2], [x23], #0x2\n"
+ "ld1 { v19.h }[2], [x22], #0x2\n"
+ "ld1 { v16.h }[2], [x20], #0x2\n"
+ "tbz x27, #0, 101f\n"
+ "ld1 { v31.b }[6], [x26]\n"
+ "ld1 { v28.b }[6], [x25]\n"
+ "ld1 { v25.b }[6], [x24]\n"
+ "ld1 { v22.b }[6], [x23]\n"
+ "ld1 { v19.b }[6], [x22]\n"
+ "ld1 { v16.b }[6], [x20]\n"
+ "b 101f\n"
+ "98:" // Height 6: Multiply loop: Ragged operand read: partial_1_4
+ "tbz x27, #0, 101f\n"
+ "ld1 { v31.b }[4], [x26]\n"
+ "ld1 { v28.b }[4], [x25]\n"
+ "ld1 { v25.b }[4], [x24]\n"
+ "ld1 { v22.b }[4], [x23]\n"
+ "ld1 { v19.b }[4], [x22]\n"
+ "ld1 { v16.b }[4], [x20]\n"
+ "b 101f\n"
+ "99:" // Height 6: Multiply loop: Ragged operand read: partial_2_0
+ "tbz x27, #1, 100f\n"
+ "ldr h31, [x26], #0x2\n"
+ "ldr h28, [x25], #0x2\n"
+ "ldr h25, [x24], #0x2\n"
+ "ldr h22, [x23], #0x2\n"
+ "ldr h19, [x22], #0x2\n"
+ "ldr h16, [x20], #0x2\n"
+ "tbz x27, #0, 101f\n"
+ "ld1 { v31.b }[2], [x26]\n"
+ "ld1 { v28.b }[2], [x25]\n"
+ "ld1 { v25.b }[2], [x24]\n"
+ "ld1 { v22.b }[2], [x23]\n"
+ "ld1 { v19.b }[2], [x22]\n"
+ "ld1 { v16.b }[2], [x20]\n"
+ "b 101f\n"
+ "100:" // Height 6: Multiply loop: Ragged operand read: partial_1_0
+ "ldr b31, [x26, #0x0]\n"
+ "ldr b28, [x25, #0x0]\n"
+ "ldr b25, [x24, #0x0]\n"
+ "ldr b22, [x23, #0x0]\n"
+ "ldr b19, [x22, #0x0]\n"
+ "ldr b16, [x20, #0x0]\n"
+ "101:" // Height 6: Multiply loop: Ragged operand read: Done
+ "uadalp v1.8h, v31.16b\n"
+ "uadalp v30.8h, v28.16b\n"
+ "uadalp v27.8h, v25.16b\n"
+ "uadalp v24.8h, v22.16b\n"
+ "uadalp v21.8h, v19.16b\n"
+ "uadalp v18.8h, v16.16b\n"
+ "102:" // Height 6: Multiply loop: No odd multiplies
+ "add x28, x28, #0x1\n"
+ "cmp x28, x21\n"
+ "bne 87b\n"
+ "uadalp v0.4s, v1.8h\n"
+ "uadalp v29.4s, v30.8h\n"
+ "addp v0.4s, v0.4s, v29.4s\n"
+ "uadalp v26.4s, v27.8h\n"
+ "uadalp v23.4s, v24.8h\n"
+ "addp v29.4s, v26.4s, v23.4s\n"
+ "uadalp v20.4s, v21.8h\n"
+ "uadalp v17.4s, v18.8h\n"
+ "addp v0.4s, v0.4s, v29.4s\n"
+ "subs %x[M], %x[M], #0x6\n"
+ "addp v20.4s, v20.4s, v17.4s\n"
+ "mul v0.4s, v0.4s, v2.4s\n"
+ "st1 { v0.4s }, [%x[out_ptr]], #0x10\n"
+ "addp v20.4s, v20.4s, v20.4s\n"
+ "mul v20.4s, v20.4s, v2.4s\n"
+ "str d20, [%x[out_ptr]], #0x8\n"
+ "beq 104f\n"
+ "tbz %x[flags], #3, 103f\n"
+ "add %x[input_offset], %x[input_offset], #0x6\n"
+ "b 1b\n"
+ "103:" // Update direct input
+ "mov x19, #0x6\n"
+ "madd %x[input_ptr], x19, %x[input_offset], %x[input_ptr]\n"
+ "b 1b\n"
+ "104:" // Exit
+
+ : [M] "+r" (M), [input_offset] "+r" (input_offset), [input_ptr] "+r" (input_ptr), [out_ptr] "+r" (out_ptr)
+ : [args_ptr] "r" (&ka), [b_offset] "I" (offsetof(Requantize32, b_offset)), [flags] "r" (flags), [offsetof_input_initial_col] "I" (offsetof(KernelArgs, input_initial_col)), [offsetof_num_strings] "I" (offsetof(KernelArgs, num_strings)), [offsetof_string_lengths] "I" (offsetof(KernelArgs, string_lengths)), [qp] "r" (qp)
+ : "cc", "memory", "v0", "v1", "v2", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28"
+ );
+}
+
+} // namespace arm_gemm
+
+#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp b/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp
index 1d3aee7911..4669be9993 100644
--- a/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/std_transforms_fixed.hpp
@@ -23,8 +23,10 @@
*/
#pragma once
+#include "convolver.hpp"
#include "mergeresults.hpp"
#include "transform.hpp"
+#include "interleave_indirect.hpp"
namespace arm_gemm {
@@ -39,14 +41,26 @@ namespace arm_gemm {
* The optional 'block' parameter is for kernels using dot-product type
* instructions like UDOT and SDOT.
*/
-template<typename TOperand, typename TResult, unsigned int height, unsigned int width, unsigned int block=1>
+template<typename TOperand, typename TResult, unsigned int height, unsigned int width, unsigned int block=1, bool integrate_sums=false>
class StdTransformsFixed
{
public:
template<typename TIn>
void PrepareA(TOperand *out, const TIn *in, const int stride, const int y0,
- const int ymax, const int k0, const int kmax) const {
- Transform<height, block, false>(out, in, stride, y0, ymax, k0, kmax);
+ const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) const { // Signature gains a trailing row_sum_multiplier; it is forwarded unchanged to Interleave below.
+ Interleave<height, block, VLType::None>(out, in, stride, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier); // Replaces the former Transform<height, block, false> call; integrate_sums is this class's bool template parameter (defaults to false).
+ }
+
+ template<typename TIn> // Variant of PrepareA whose input is a three-level pointer table (ptr) instead of a base pointer plus stride.
+ void PrepareA_indirect(TOperand *out, const TIn * const * const *ptr, size_t stringlen, size_t rounded_stringlen, const int y0, // NOTE(review): how ptr/stringlen/rounded_stringlen are indexed is decided inside IndirectInterleave, which is not visible here.
+ const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) { // NOTE(review): not const-qualified, whereas PrepareA in this class is - confirm this is intentional.
+ IndirectInterleave<height, block, VLType::None>(out, ptr, stringlen, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier); // Pure forwarder: adds only the class's height/block/integrate_sums template values and VLType::None.
+ }
+
+ template<typename TIn> // Variant of PrepareA that takes a convolver<TIn> describing how the input is to be read.
+ void PrepareA_convolution(TOperand *out, const TIn *ptr, size_t stride, const convolver<TIn> &conv, size_t rounded_stringlen, // convolver comes from convolver.hpp; NOTE(review): its semantics are not visible in this hunk.
+ const int y0, const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) { // NOTE(review): not const-qualified, whereas PrepareA in this class is - confirm this is intentional.
+ ConvolutionInterleave<height, block, VLType::None>(out, ptr, stride, conv, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier); // Pure forwarder: every argument is passed through, plus the class's integrate_sums flag.
}
template<typename TIn>
diff --git a/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp b/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp
index 13c4c477c6..3256d919ea 100644
--- a/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp
+++ b/src/core/NEON/kernels/arm_gemm/std_transforms_sve.hpp
@@ -23,6 +23,7 @@
*/
#pragma once
+#include "convolver.hpp"
#include "mergeresults.hpp"
#include "transform.hpp"
@@ -38,20 +39,32 @@ namespace arm_gemm {
* The optional 'block' parameter is for kernels using dot-product type
* instructions like UDOT and SDOT.
*/
-template<typename TOperand, typename TResult, unsigned int height, unsigned int width_vectors, unsigned int block=1, unsigned int mmla=1>
+template<typename TOperand, typename TResult, unsigned int height, unsigned int width_vectors, unsigned int block=1, unsigned int mmla=1, bool integrate_sums=false>
class StdTransformsSVE
{
public:
template<typename TIn>
void PrepareA(TOperand *out, const TIn *in, const int stride, const int y0,
- const int ymax, const int k0, const int kmax) {
- Transform<height, block, false>(out, in, stride, y0, ymax, k0, kmax);
+ const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) { // Signature gains a trailing row_sum_multiplier; it is forwarded unchanged to Interleave below.
+ Interleave<height, block, VLType::None>(out, in, stride, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier); // Replaces Transform<height, block, false>. NOTE(review): uses VLType::None although PrepareB in this class uses VLType::SVE - presumably deliberate, confirm.
+ }
+
+ template<typename TIn> // Variant of PrepareA whose input is a three-level pointer table (ptr) instead of a base pointer plus stride.
+ void PrepareA_indirect(TOperand *out, const TIn * const * const *ptr, size_t stringlen, size_t rounded_stringlen, const int y0, // NOTE(review): how ptr/stringlen/rounded_stringlen are indexed is decided inside IndirectInterleave, which is not visible here.
+ const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) { // row_sum_multiplier is passed straight through to IndirectInterleave.
+ IndirectInterleave<height, block, VLType::None>(out, ptr, stringlen, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier); // Pure forwarder: adds only the class's height/block/integrate_sums template values and VLType::None.
+ }
+
+ template<typename TIn> // Variant of PrepareA that takes a convolver<TIn> describing how the input is to be read.
+ void PrepareA_convolution(TOperand *out, const TIn *ptr, size_t stride, const convolver<TIn> &conv, size_t rounded_stringlen, // convolver comes from convolver.hpp; NOTE(review): its semantics are not visible in this hunk.
+ const int y0, const int ymax, const int k0, const int kmax, int32_t row_sum_multiplier) { // row_sum_multiplier is passed straight through to ConvolutionInterleave.
+ ConvolutionInterleave<height, block, VLType::None>(out, ptr, stride, conv, rounded_stringlen, y0, ymax, k0, kmax, integrate_sums, row_sum_multiplier); // Pure forwarder: every argument is passed through, plus the class's integrate_sums flag.
}
template<typename TIn>
void PrepareB(TOperand *out, const TIn *in, const int stride, const int x0,
const int xmax, const int k0, const int kmax) {
- Transform<width_vectors, block, true, true>(out, in, stride, x0, xmax, k0, kmax);
+ Transform<width_vectors, block, true, VLType::SVE>(out, in, stride, x0, xmax, k0, kmax); // Fourth template argument is now the VLType enumerator VLType::SVE in place of the former bool `true`; runtime arguments are unchanged.
}
template<typename TOut>
diff --git a/src/core/NEON/kernels/arm_gemm/transform.hpp b/src/core/NEON/kernels/arm_gemm/transform.hpp
index c6ea079882..5efeee5d35 100644
--- a/src/core/NEON/kernels/arm_gemm/transform.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transform.hpp
@@ -38,13 +38,13 @@ namespace arm_gemm {
* Need to cope with the work requested in either dimension not actually
* being a multiple of the block sizes.
*/
-template <unsigned int tIntBy, unsigned int BlockBy, bool Transposed, size_t TOutSize, size_t TInSize, bool sve>
+template <unsigned int tIntBy, unsigned int BlockBy, bool Transposed, size_t TOutSize, size_t TInSize, VLType vlt>
struct TransformImpl {
template <typename TOut, typename TIn>
static void Transform(TOut* out, const TIn* const in, const int stride,
const int y0, const int ymax, const int x0, const int xmax) {
// For SVE cases we multiply the interleave factor by the vector length.
- const unsigned int IntBy = tIntBy * (sve ? get_vector_length<TOut>() / BlockBy : 1);
+ const unsigned int IntBy = tIntBy * (vlt == VLType::SVE ? get_vector_length<TOut>() / BlockBy : 1);
const int n_whole_y_blocks = (ymax - y0) / IntBy;
const int y_remainders = (ymax - y0) % IntBy;
@@ -105,13 +105,13 @@ struct TransformImpl {
};
/*****************************************************************************/
-template <unsigned int IntBy, unsigned int BlockBy, bool Transposed, bool sve=false, typename TOut, typename TIn>
+template <unsigned int IntBy, unsigned int BlockBy, bool Transposed, VLType vlt=VLType::None, typename TOut, typename TIn> // The bool `sve` template flag becomes a VLType value; the default VLType::None takes the place of the old sve=false default.
void Transform(
TOut* out, const TIn* const in, const int stride,
const int k0, const int kmax, const int x0, const int xmax
) {
// Redirect to a specialised implementation predicated on argument size.
- TransformImpl<IntBy, BlockBy, Transposed, sizeof(TOut), sizeof(TIn), sve>::Transform(
+ TransformImpl<IntBy, BlockBy, Transposed, sizeof(TOut), sizeof(TIn), vlt>::Transform( // vlt is forwarded as TransformImpl's last template argument. NOTE(review): k0/kmax here land in the positions TransformImpl::Transform names y0/ymax - naming mismatch only, confirm.
out, in, stride, k0, kmax, x0, xmax
);
}
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
deleted file mode 100644
index 2df5d1bd28..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/a32_interleave_6way_32bit.hpp
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * Copyright (c) 2017-2018 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __arm__
-
-#include <arm_neon.h>
-
-#include "../asmlib.hpp"
-
-template<>
-template<typename T>
-inline void TransformImpl<6, 1, false, 4, 4, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
- uint32_t *outptr = reinterpret_cast<uint32_t *>(out);
- const uint32_t *inptr = reinterpret_cast<const uint32_t *>(in);
- bool first = true;
-
- uint32_t zerobuff[16] = { 0 }; // 8 for asm loop plus up to 7 for overflow loop
-
- for (int y=y0; y<ymax; y+=6) {
- const uint32_t *inptr0 = inptr + y * ldin + k0;
- const uint32_t *inptr1 = inptr0 + ldin;
- const uint32_t *inptr2 = inptr1 + ldin;
- const uint32_t *inptr3 = inptr2 + ldin;
- const uint32_t *inptr4 = inptr3 + ldin;
- const uint32_t *inptr5 = inptr4 + ldin;
-
- //prefetch_2x(inptr0);
- //prefetch_2x(inptr1);
- //prefetch_2x(inptr2);
- //prefetch_2x(inptr3);
- //prefetch_2x(inptr4);
- //prefetch_2x(inptr5);
-
- int x=(kmax-k0);
- for (;(x>7) || first;x-=8) {
- /* Cope with ragged cases by copying from a buffer of zeroes instead */
- /* 'first' forces this to always run at least once, needed if the total size is <=7. */
- if ((y + 5) >= ymax) {
- switch ((y + 5) - ymax) {
- case 4:
- inptr1 = zerobuff;
- // fall through
- case 3:
- inptr2 = zerobuff;
- // fall through
- case 2:
- inptr3 = zerobuff;
- // fall through
- case 1:
- inptr4 = zerobuff;
- // fall through
- case 0:
- inptr5 = zerobuff;
- break;
-
- default:
- UNREACHABLE("Impossible.");
- }
- }
-
- if (first) {
- if (x<=7) {
- break;
- }
-
- first = false;
- }
-
- __asm __volatile (
- // Load up 8 elements (2 vectors) from each of 8 sources.
- "VLD1.32 {d0-d3}, [%[inptr0]]!\n" // q0=A0A1A2A3
- "VLD1.32 {d4-d7}, [%[inptr1]]!\n" // q2=B0B1B2B3
- "VLD1.32 {d8-d11}, [%[inptr2]]!\n" // q4=C0C1C2C3
- "VZIP.32 q0, q4\n" // q0=A0C0A1C1, q4 = A2C2A3C3
- "VLD1.32 {d12-d15}, [%[inptr3]]!\n" // q6=D0D1D2D3
- "VZIP.32 q2, q6\n" // q2=B0D0B1D1, q6 = B2D2B3D3
- "VLD1.32 {d16-d19}, [%[inptr4]]!\n"
- "VLD1.32 {d20-d23}, [%[inptr5]]!\n"
- "VZIP.32 q8, q10\n" // q8=E0F0E1F1, q10 = E2F2E3F3
- ASM_PREFETCH("[%[inptr0], #128]")
- "VZIP.32 q0, q2\n" // q0 = A0B0C0D0, q2 = A1B1C1D1
-
- // Store first elements
- "VST1.32 {d0-d1}, [%[outptr]]!\n"
- "VST1.32 {d16}, [%[outptr]]!\n"
-
- "VZIP.32 q4, q6\n" // q4 = A2B2C2D2, q6 = A3B3C3D3
-
- // Store second elements
- "VST1.32 {d4-d5}, [%[outptr]]!\n"
- "VZIP.32 q1, q5\n"
- ASM_PREFETCH("[%[inptr1], #128]")
- "VST1.32 {d17}, [%[outptr]]!\n"
- "VZIP.32 q3, q7\n"
-
- // Store third elements
- "VZIP.32 q9, q11\n"
- "VST1.32 {d8-d9}, [%[outptr]]!\n"
- "VZIP.32 q1, q3\n"
- ASM_PREFETCH("[%[inptr2], #128]")
- "VST1.32 {d20}, [%[outptr]]!\n"
-
- // Store fourth elements
- "VZIP.32 q5, q7\n"
- "VST1.32 {d12-d13}, [%[outptr]]!\n"
- ASM_PREFETCH("[%[inptr3], #128]")
- "VST1.32 {d21}, [%[outptr]]!\n"
-
- // Fifth
- "VST1.32 {d2-d3}, [%[outptr]]!\n"
- ASM_PREFETCH("[%[inptr4], #128]")
- "VST1.32 {d18}, [%[outptr]]!\n"
-
- // Sixth
- "VST1.32 {d6-d7}, [%[outptr]]!\n"
- ASM_PREFETCH("[%[inptr5], #128]")
- "VST1.32 {d19}, [%[outptr]]!\n"
-
- // Seventh
- "VST1.32 {d10-d11}, [%[outptr]]!\n"
- "VST1.32 {d22}, [%[outptr]]!\n"
-
- // Eighth
- "VST1.32 {d14-d15}, [%[outptr]]!\n"
- "VST1.32 {d23}, [%[outptr]]!\n"
-
- : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
- [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [outptr] "+r" (outptr)
- :
- : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "memory"
- );
- }
-
- for (;x>0;x--) {
- *outptr++ = *inptr0++;
- *outptr++ = *inptr1++;
- *outptr++ = *inptr2++;
- *outptr++ = *inptr3++;
- *outptr++ = *inptr4++;
- *outptr++ = *inptr5++;
- }
- }
-}
-
-#endif // __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
index 8f0b8ae63f..3ce1d328a7 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a32_transpose_interleave_8way_32bit.hpp
@@ -30,22 +30,22 @@
// Generic unblocked transposed 8x32-bit sized specialisation
template <>
template <typename T>
-inline void TransformImpl<8, 1, true, 4, 4, false>::Transform(
+inline void TransformImpl<8, 1, true, 4, 4, VLType::None>::Transform(
T* out, const T* const in, const int stride,
const int x0, const int xmax, const int k0, const int kmax
) {
// Redirect to a 16x uint16_t specialisation
- TransformImpl<16, 1, true, 2, 2, false>::Transform(
+ TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
reinterpret_cast<uint16_t *>(out),
reinterpret_cast<const uint16_t *>(in),
stride*2, x0*2, xmax*2, k0, kmax
);
}
-// Generic 12x16-bit sized specialisation
+// Generic 16x16-bit sized specialisation
template <>
template <typename T>
-inline void TransformImpl<16, 1, true, 2, 2, false>::Transform(
+inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
T* out, const T* const in, const int stride,
const int x0, const int xmax, const int k0, const int kmax
) {
@@ -117,7 +117,7 @@ inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(con
template <>
template <>
-inline void TransformImpl<16, 1, true, 2, 2, false>::Transform(
+inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
uint16_t* out, const uint16_t* const in, const int stride,
const int x0, const int xmax, const int k0, const int kmax
) {
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp
deleted file mode 100644
index 9b6f4de543..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_block16_interleave4_8bit.hpp
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include <arm_neon.h>
-
-#include "../asmlib.hpp"
-#include "../utils.hpp"
-
-template<>
-template<typename T>
-void TransformImpl<4, 16, false, 1, 1, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
- uint8_t *outptr = (uint8_t *)out;
- const uint8_t *inptr = (uint8_t *)in;
-
- uint8_t zerobuff[16] = { 0 };
-
- for (int y=y0; y<ymax; y+=4) {
- const uint8_t *inptr0 = inptr + static_cast<intptr_t>(y) * ldin + k0;
- const uint8_t *inptr1 = inptr0 + ldin;
- const uint8_t *inptr2 = inptr1 + ldin;
- const uint8_t *inptr3 = inptr2 + ldin;
-
- prefetch_2x(inptr0);
- prefetch_2x(inptr1);
- prefetch_2x(inptr2);
- prefetch_2x(inptr3);
-
- int x=(kmax-k0);
- for (;x>15;x-=16) {
- /* Cope with ragged cases by copying from a buffer of zeroes instead */
- if ((y + 3) >= ymax) {
- switch ((y + 3) - ymax) {
- case 2:
- inptr1 = zerobuff;
- // fall through
- case 1:
- inptr2 = zerobuff;
- // fall through
- case 0:
- inptr3 = zerobuff;
- break;
-
- default:
- UNREACHABLE("Impossible.");
- }
- }
-
- __asm __volatile (
- "LDR q0, [%[inptr0]], #16\n"
- ASM_PREFETCH("[%[inptr0], #176]")
- "LDR q1, [%[inptr1]], #16\n"
- ASM_PREFETCH("[%[inptr1], #176]")
- "STP q0, q1, [%[outptr]], #32\n"
- "LDR q0, [%[inptr2]], #16\n"
- ASM_PREFETCH("[%[inptr2], #176]")
- "LDR q1, [%[inptr3]], #16\n"
- ASM_PREFETCH("[%[inptr3], #176]")
- "STP q0, q1, [%[outptr]], #32\n"
- : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
- [outptr] "+r" (outptr)
- :
- : "v0", "v1"
- );
- }
-
- if (x>0) {
- /* Need to duplicate this here, in case we didn't run the main loop. */
- if ((y + 3) >= ymax) {
- switch ((y + 3) - ymax) {
- case 2:
- inptr1 = zerobuff;
- // fall through
- case 1:
- inptr2 = zerobuff;
- // fall through
- case 0:
- inptr3 = zerobuff;
- break;
-
- default:
- UNREACHABLE("Impossible.");
- }
- }
-
- /* We have to write out 16 values, copy as many legal values as there are and pad with 0 */
- auto f = [&outptr, x](const uint8_t *&p) {
- for (int i=0; i<16; i++) {
- if (i < x) {
- *outptr++ = *p++;
- } else {
- *outptr++ = 0;
- }
- }
- };
-
- f(inptr0);
- f(inptr1);
- f(inptr2);
- f(inptr3);
- }
- }
-}
-
-#endif // __aarch64__ \ No newline at end of file
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
deleted file mode 100644
index 3d912c4675..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_16bit.hpp
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __aarch64__
-
-#include <arm_neon.h>
-
-#include "../asmlib.hpp"
-
-template<>
-template<typename T>
-void TransformImpl<8, 1, false, 2, 2, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
- uint16_t *outptr = (uint16_t *)out;
- const uint16_t *inptr = (const uint16_t *)in;
- bool first=true;
-
- uint16_t zerobuff[16] = { 0 }; // 8 for asm loop plus up to 7 for overflow loop
-
- for (int y=y0; y<ymax; y+=8) {
- const uint16_t *inptr0 = inptr + y * ldin + k0;
- const uint16_t *inptr1 = inptr0 + ldin;
- const uint16_t *inptr2 = inptr1 + ldin;
- const uint16_t *inptr3 = inptr2 + ldin;
- const uint16_t *inptr4 = inptr3 + ldin;
- const uint16_t *inptr5 = inptr4 + ldin;
- const uint16_t *inptr6 = inptr5 + ldin;
- const uint16_t *inptr7 = inptr6 + ldin;
-
- prefetch_2x(inptr0);
- prefetch_2x(inptr1);
- prefetch_2x(inptr2);
- prefetch_2x(inptr3);
- prefetch_2x(inptr4);
- prefetch_2x(inptr5);
- prefetch_2x(inptr6);
- prefetch_2x(inptr7);
-
- int x=(kmax-k0);
- for (;(x>7) || first;x-=8) {
- /* Cope with ragged cases by copying from a buffer of zeroes instead */
- /* 'first' forces this to always run at least once, needed if the total size is <=7. */
- if ((y + 7) >= ymax) {
- switch ((y + 7) - ymax) {
- case 6:
- inptr1 = zerobuff;
- // fall through
- case 5:
- inptr2 = zerobuff;
- // fall through
- case 4:
- inptr3 = zerobuff;
- // fall through
- case 3:
- inptr4 = zerobuff;
- // fall through
- case 2:
- inptr5 = zerobuff;
- // fall through
- case 1:
- inptr6 = zerobuff;
- // fall through
- case 0:
- inptr7 = zerobuff;
- break;
-
- default:
- UNREACHABLE("Impossible.");
- }
- }
-
- if (first) {
- if (x <= 7) {
- break;
- }
-
- first = false;
- }
-
- int skippf = (x & 31);
- __asm __volatile (
- // Load up 8 elements (1 vector) from each of 8 sources.
- "CBNZ %w[skippf], 1f\n"
- ASM_PREFETCH("[%[inptr0], #128]")
- ASM_PREFETCH("[%[inptr1], #128]")
- ASM_PREFETCH("[%[inptr2], #128]")
- ASM_PREFETCH("[%[inptr3], #128]")
- "1:\n"
-
- "LDR q0, [%[inptr0]], #16\n" // q0=A0A1A2A3A4A5A6A7
- "LDR q4, [%[inptr4]], #16\n" // q8=E0E1E2E3E4E5E6E7
- "LDR q2, [%[inptr2]], #16\n" // q4=C0C1C2C3...
- "LDR q6, [%[inptr6]], #16\n"
- "ZIP1 v8.8h, v0.8h, v4.8h\n" // q8=A0E0A1E1A2E2A3E3
- "ZIP2 v16.8h, v0.8h, v4.8h\n" // q16=A4E4A5E5A6E6A7E7
- "ZIP1 v9.8h, v2.8h, v6.8h\n" // q9=C0G0C1G1C2G2C3G3
- "ZIP2 v17.8h, v2.8h, v6.8h\n" // q17=C4G4C5G5C6G6C7G7
- "LDR q1, [%[inptr1]], #16\n" // q1=B0B1B2B3B4B5B6B7
- "LDR q5, [%[inptr5]], #16\n"
- "LDR q3, [%[inptr3]], #16\n" // q3=D0D1D2D3....
- "LDR q7, [%[inptr7]], #16\n"
- "ZIP1 v10.8h, v1.8h, v5.8h\n" // q18=B0F0B1F1B2F2B3F3
- "ZIP2 v18.8h, v1.8h, v5.8h\n" // q18=B4F4B5F5B6F6B7F7
- "ZIP1 v11.8h, v3.8h, v7.8h\n" // q19=D0H0D1H1D2H2D3H3
- "ZIP2 v19.8h, v3.8h, v7.8h\n" // q19=D4H4D5H5D6H6D7H7
-
- "ZIP1 v12.8h, v8.8h, v9.8h\n" // q20=A0C0E0G0A1C1E1G1
- "ZIP2 v20.8h, v8.8h, v9.8h\n"
- "ZIP1 v13.8h, v10.8h, v11.8h\n" // q21=B0D0F0H0B1I1F1H1
- "ZIP2 v21.8h, v10.8h, v11.8h\n"
-
- "CBNZ %w[skippf], 2f\n"
- ASM_PREFETCH("[%[inptr4], #112]")
- ASM_PREFETCH("[%[inptr5], #112]")
- ASM_PREFETCH("[%[inptr6], #112]")
- ASM_PREFETCH("[%[inptr7], #112]")
- "2:\n"
-
- "ZIP1 v22.8h, v16.8h, v17.8h\n"
- "ZIP2 v30.8h, v16.8h, v17.8h\n"
- "ZIP1 v23.8h, v18.8h, v19.8h\n"
- "ZIP2 v31.8h, v18.8h, v19.8h\n"
-
- "ZIP1 v14.8h, v12.8h, v13.8h\n" // q22=A0B0C0D0E0F0G0H0
- "ZIP2 v15.8h, v12.8h, v13.8h\n" // q23=A1B1C1D1E1F1G1H1
- "STP q14, q15, [%[outptr]], #32\n" // Write back first two elements
-
- "ZIP1 v0.8h, v20.8h, v21.8h\n"
- "ZIP2 v1.8h, v20.8h, v21.8h\n"
- "STP q0, q1, [%[outptr]], #32\n" // Write back next two elements
-
- "ZIP1 v2.8h, v22.8h, v23.8h\n"
- "ZIP2 v3.8h, v22.8h, v23.8h\n"
- "STP q2, q3, [%[outptr]], #32\n" // Write back next two elements
-
- "ZIP1 v4.8h, v30.8h, v31.8h\n"
- "ZIP2 v5.8h, v30.8h, v31.8h\n"
- "STP q4, q5, [%[outptr]], #32\n" // Write back last two elements
- : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
- [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
- : [skippf] "r" (skippf)
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
- "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
- "v25", "v26", "v27", "v28", "v29", "v30", "v31", "memory"
- );
- }
-
- for (;x>0;x--) {
- *outptr++ = *inptr0++;
- *outptr++ = *inptr1++;
- *outptr++ = *inptr2++;
- *outptr++ = *inptr3++;
- *outptr++ = *inptr4++;
- *outptr++ = *inptr5++;
- *outptr++ = *inptr6++;
- *outptr++ = *inptr7++;
- }
- }
-}
-
-#endif // __aarch64__
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
deleted file mode 100644
index 701d688af2..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_32bit.hpp
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
-
-#include <arm_neon.h>
-
-#include "../asmlib.hpp"
-
-template<>
-template<typename T>
-inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
- uint32_t *outptr = (uint32_t *)out;
- const uint32_t *inptr = (uint32_t *)in;
- bool first = true;
-
- uint32_t zerobuff[16] = { 0 }; // 8 for asm loop plus up to 7 for overflow loop
-
- for (int y=y0; y<ymax; y+=8) {
- const uint32_t *inptr0 = inptr + y * ldin + k0;
- const uint32_t *inptr1 = inptr0 + ldin;
- const uint32_t *inptr2 = inptr1 + ldin;
- const uint32_t *inptr3 = inptr2 + ldin;
- const uint32_t *inptr4 = inptr3 + ldin;
- const uint32_t *inptr5 = inptr4 + ldin;
- const uint32_t *inptr6 = inptr5 + ldin;
- const uint32_t *inptr7 = inptr6 + ldin;
-
- prefetch_2x(inptr0);
- prefetch_2x(inptr1);
- prefetch_2x(inptr2);
- prefetch_2x(inptr3);
- prefetch_2x(inptr4);
- prefetch_2x(inptr5);
- prefetch_2x(inptr6);
- prefetch_2x(inptr7);
-
- int x=(kmax-k0);
- for (;(x>7) || first;x-=8) {
- /* Cope with ragged cases by copying from a buffer of zeroes instead */
- /* 'first' forces this to always run at least once, needed if the total size is <=7. */
- if ((y + 7) >= ymax) {
- switch ((y + 7) - ymax) {
- case 6:
- inptr1 = zerobuff;
- // fall through
- case 5:
- inptr2 = zerobuff;
- // fall through
- case 4:
- inptr3 = zerobuff;
- // fall through
- case 3:
- inptr4 = zerobuff;
- // fall through
- case 2:
- inptr5 = zerobuff;
- // fall through
- case 1:
- inptr6 = zerobuff;
- // fall through
- case 0:
- inptr7 = zerobuff;
- break;
-
- default:
- UNREACHABLE("Impossible.");
- }
- }
-
- if (first) {
- if (x<=7) {
- break;
- }
-
- first = false;
- }
-
- __asm __volatile (
- // Load up 8 elements (2 vectors) from each of 8 sources.
- "LDP q0, q1, [%[inptr0]], #32\n" // q0=A0A1A2A3
- "LDP q2, q3, [%[inptr1]], #32\n" // q2=B0B1B2B3
- "LDP q4, q5, [%[inptr2]], #32\n" // q4=C0C1C2C3
- "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1
- ASM_PREFETCH("[%[inptr0], #128]")
- "LDP q6, q7, [%[inptr3]], #32\n" // q6=D0D1D2D3
- "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1
- "LDP q8, q9, [%[inptr4]], #32\n"
- "LDP q10, q11, [%[inptr5]], #32\n"
- "LDP q12, q13, [%[inptr6]], #32\n"
- "ZIP1 v18.4s, v8.4s, v12.4s\n"
- ASM_PREFETCH("[%[inptr1], #128]")
- "LDP q14, q15, [%[inptr7]], #32\n"
- "ZIP1 v19.4s, v10.4s, v14.4s\n"
-
- "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0
- ASM_PREFETCH("[%[inptr2], #128]")
- "ZIP1 v21.4s, v18.4s, v19.4s\n"
- "ZIP2 v22.4s, v16.4s, v17.4s\n"
- "ZIP2 v23.4s, v18.4s, v19.4s\n"
-
- "ZIP2 v16.4s, v0.4s, v4.4s\n"
- ASM_PREFETCH("[%[inptr3], #128]")
- "ZIP2 v17.4s, v2.4s, v6.4s\n"
- "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
-
- "ZIP2 v18.4s, v8.4s, v12.4s\n"
- "ZIP2 v19.4s, v10.4s, v14.4s\n"
- "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
-
- "ZIP1 v20.4s, v16.4s, v17.4s\n"
- ASM_PREFETCH("[%[inptr4], #128]")
- "ZIP1 v21.4s, v18.4s, v19.4s\n"
- "ZIP2 v22.4s, v16.4s, v17.4s\n"
- "ZIP2 v23.4s, v18.4s, v19.4s\n"
-
- "ZIP1 v16.4s, v1.4s, v5.4s\n"
- ASM_PREFETCH("[%[inptr5], #128]")
- "ZIP1 v17.4s, v3.4s, v7.4s\n"
- "STP q20, q21, [%[outptr]], #32\n" // Third element
-
- "ZIP1 v18.4s, v9.4s, v13.4s\n"
- "ZIP1 v19.4s, v11.4s, v15.4s\n"
- "STP q22, q23, [%[outptr]], #32\n" // Fourth element
-
- "ZIP1 v20.4s, v16.4s, v17.4s\n"
- "ZIP1 v21.4s, v18.4s, v19.4s\n"
- "ZIP2 v22.4s, v16.4s, v17.4s\n"
- ASM_PREFETCH("[%[inptr6], #128]")
- "ZIP2 v23.4s, v18.4s, v19.4s\n"
-
- "ZIP2 v16.4s, v1.4s, v5.4s\n"
- "ZIP2 v17.4s, v3.4s, v7.4s\n"
- "STP q20, q21, [%[outptr]], #32\n" // Fifth element
-
- "ZIP2 v18.4s, v9.4s, v13.4s\n"
- ASM_PREFETCH("[%[inptr7], #128]")
- "ZIP2 v19.4s, v11.4s, v15.4s\n"
- "STP q22, q23, [%[outptr]], #32\n" // Sixth element
-
- "ZIP1 v20.4s, v16.4s, v17.4s\n"
- "ZIP1 v21.4s, v18.4s, v19.4s\n"
- "STP q20, q21, [%[outptr]], #32\n" // Seventh element
-
- "ZIP2 v22.4s, v16.4s, v17.4s\n"
- "ZIP2 v23.4s, v18.4s, v19.4s\n"
- "STP q22, q23, [%[outptr]], #32\n" // Eighth element
- : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
- [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
- "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "memory"
- );
- }
-
- for (;x>0;x--) {
- *outptr++ = *inptr0++;
- *outptr++ = *inptr1++;
- *outptr++ = *inptr2++;
- *outptr++ = *inptr3++;
- *outptr++ = *inptr4++;
- *outptr++ = *inptr5++;
- *outptr++ = *inptr6++;
- *outptr++ = *inptr7++;
- }
- }
-}
-
-#endif // __aarch64__ && !__ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp
deleted file mode 100644
index 2546cc571a..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_block4_8bit.hpp
+++ /dev/null
@@ -1,228 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#if defined(__aarch64__) && !defined(__ARM_FEATURE_SVE)
-
-#include <arm_neon.h>
-
-#include "../asmlib.hpp"
-
-template<>
-template<typename T>
-inline void TransformImpl<8, 4, false, 1, 1, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax) {
- uint8_t *outptr = reinterpret_cast<uint8_t *>(out);
- const uint8_t *inptr = reinterpret_cast<const uint8_t *>(in);
- bool first = true;
-
- /* Helper functions to copy blocks about used for odd case. */
- class t {
- public:
- static inline void copy_4_inc(uint8_t *&out, const uint8_t *&in) {
- uint32_t *out_word = reinterpret_cast<uint32_t *>(out);
- const uint32_t *in_word = reinterpret_cast<const uint32_t *>(in);
-
- *out_word++ = *in_word++;
-
- out = reinterpret_cast<uint8_t *>(out_word);
- in = reinterpret_cast<const uint8_t *>(in_word);
- }
-
- static inline void copy_pad(uint8_t *&out, const uint8_t *&in, size_t count) {
- for (unsigned int i=0; i<4; i++) {
- if (i < count) {
- *out++ = *in++;
- } else {
- *out++ = 0;
- }
- }
- }
- };
-
- uint8_t zerobuff[64] = { 0 }; // 32 for asm loop plus up to 31 for overflow loop
-
- for (int y=y0; y<ymax; y+=8) {
- const uint8_t *inptr0 = inptr + y * ldin + k0;
- const uint8_t *inptr1 = inptr0 + ldin;
- const uint8_t *inptr2 = inptr1 + ldin;
- const uint8_t *inptr3 = inptr2 + ldin;
- const uint8_t *inptr4 = inptr3 + ldin;
- const uint8_t *inptr5 = inptr4 + ldin;
- const uint8_t *inptr6 = inptr5 + ldin;
- const uint8_t *inptr7 = inptr6 + ldin;
-
- prefetch_2x(inptr0);
- prefetch_2x(inptr1);
- prefetch_2x(inptr2);
- prefetch_2x(inptr3);
- prefetch_2x(inptr4);
- prefetch_2x(inptr5);
- prefetch_2x(inptr6);
- prefetch_2x(inptr7);
-
- int x=(kmax-k0);
- for (;(x>31) || first;x-=32) {
- /* Cope with ragged cases by copying from a buffer of zeroes instead */
- /* 'first' forces this to always run at least once, needed if the total size is <=32. */
- if ((y + 7) >= ymax) {
- switch ((y + 7) - ymax) {
- case 6:
- inptr1 = zerobuff;
- // fall through
- case 5:
- inptr2 = zerobuff;
- // fall through
- case 4:
- inptr3 = zerobuff;
- // fall through
- case 3:
- inptr4 = zerobuff;
- // fall through
- case 2:
- inptr5 = zerobuff;
- // fall through
- case 1:
- inptr6 = zerobuff;
- // fall through
- case 0:
- inptr7 = zerobuff;
- break;
-
- default:
- UNREACHABLE("Impossible.");
- }
- }
-
- if (first) {
- if (x<=31) {
- break;
- }
-
- first = false;
- }
-
- __asm __volatile (
- // Load up 8 elements (2 vectors) from each of 8 sources.
- "LDP q0, q1, [%[inptr0]], #32\n" // q0=A0A1A2A3
- "LDP q2, q3, [%[inptr1]], #32\n" // q2=B0B1B2B3
- "LDP q4, q5, [%[inptr2]], #32\n" // q4=C0C1C2C3
- "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1
- ASM_PREFETCH("[%[inptr0], #128]")
- "LDP q6, q7, [%[inptr3]], #32\n" // q6=D0D1D2D3
- "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1
- "LDP q8, q9, [%[inptr4]], #32\n"
- "LDP q10, q11, [%[inptr5]], #32\n"
- "LDP q12, q13, [%[inptr6]], #32\n"
- "ZIP1 v18.4s, v8.4s, v12.4s\n"
- ASM_PREFETCH("[%[inptr1], #128]")
- "LDP q14, q15, [%[inptr7]], #32\n"
- "ZIP1 v19.4s, v10.4s, v14.4s\n"
-
- "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0
- ASM_PREFETCH("[%[inptr2], #128]")
- "ZIP1 v21.4s, v18.4s, v19.4s\n"
- "ZIP2 v22.4s, v16.4s, v17.4s\n"
- "ZIP2 v23.4s, v18.4s, v19.4s\n"
-
- "ZIP2 v16.4s, v0.4s, v4.4s\n"
- ASM_PREFETCH("[%[inptr3], #128]")
- "ZIP2 v17.4s, v2.4s, v6.4s\n"
- "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
-
- "ZIP2 v18.4s, v8.4s, v12.4s\n"
- "ZIP2 v19.4s, v10.4s, v14.4s\n"
- "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
-
- "ZIP1 v20.4s, v16.4s, v17.4s\n"
- ASM_PREFETCH("[%[inptr4], #128]")
- "ZIP1 v21.4s, v18.4s, v19.4s\n"
- "ZIP2 v22.4s, v16.4s, v17.4s\n"
- "ZIP2 v23.4s, v18.4s, v19.4s\n"
-
- "ZIP1 v16.4s, v1.4s, v5.4s\n"
- ASM_PREFETCH("[%[inptr5], #128]")
- "ZIP1 v17.4s, v3.4s, v7.4s\n"
- "STP q20, q21, [%[outptr]], #32\n" // Third element
-
- "ZIP1 v18.4s, v9.4s, v13.4s\n"
- "ZIP1 v19.4s, v11.4s, v15.4s\n"
- "STP q22, q23, [%[outptr]], #32\n" // Fourth element
-
- "ZIP1 v20.4s, v16.4s, v17.4s\n"
- "ZIP1 v21.4s, v18.4s, v19.4s\n"
- "ZIP2 v22.4s, v16.4s, v17.4s\n"
- ASM_PREFETCH("[%[inptr6], #128]")
- "ZIP2 v23.4s, v18.4s, v19.4s\n"
-
- "ZIP2 v16.4s, v1.4s, v5.4s\n"
- "ZIP2 v17.4s, v3.4s, v7.4s\n"
- "STP q20, q21, [%[outptr]], #32\n" // Fifth element
-
- "ZIP2 v18.4s, v9.4s, v13.4s\n"
- ASM_PREFETCH("[%[inptr7], #128]")
- "ZIP2 v19.4s, v11.4s, v15.4s\n"
- "STP q22, q23, [%[outptr]], #32\n" // Sixth element
-
- "ZIP1 v20.4s, v16.4s, v17.4s\n"
- "ZIP1 v21.4s, v18.4s, v19.4s\n"
- "STP q20, q21, [%[outptr]], #32\n" // Seventh element
-
- "ZIP2 v22.4s, v16.4s, v17.4s\n"
- "ZIP2 v23.4s, v18.4s, v19.4s\n"
- "STP q22, q23, [%[outptr]], #32\n" // Eighth element
- : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
- [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
- "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "memory"
- );
- }
-
- // Copy any leftover blocks of 4 a complete block at a time.
- for (;x>4;x-=4) {
- t::copy_4_inc(outptr, inptr0);
- t::copy_4_inc(outptr, inptr1);
- t::copy_4_inc(outptr, inptr2);
- t::copy_4_inc(outptr, inptr3);
- t::copy_4_inc(outptr, inptr4);
- t::copy_4_inc(outptr, inptr5);
- t::copy_4_inc(outptr, inptr6);
- t::copy_4_inc(outptr, inptr7);
- }
-
- // Final block with padding, if any.
- if (x > 0) {
- t::copy_pad(outptr, inptr0, x);
- t::copy_pad(outptr, inptr1, x);
- t::copy_pad(outptr, inptr2, x);
- t::copy_pad(outptr, inptr3, x);
- t::copy_pad(outptr, inptr4, x);
- t::copy_pad(outptr, inptr5, x);
- t::copy_pad(outptr, inptr6, x);
- t::copy_pad(outptr, inptr7, x);
- }
- }
-}
-
-#endif // __aarch64__ && !__ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
deleted file mode 100644
index a342d6c3d1..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_half_to_float.hpp
+++ /dev/null
@@ -1,207 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#if defined(__aarch64__) && defined(__ARM_FP16_ARGS)
-
-#include <arm_neon.h>
-
-#include "../asmlib.hpp"
-
-template<>
-template<>
-inline void TransformImpl<8, 1, false, 4, 2, false>::Transform(float *out, const __fp16 *in, int ldin, int y0, int ymax, int k0, int kmax) {
- float *outptr = out;
- const __fp16 *inptr = in;
- bool first = true;
-
- __fp16 zerobuff[16] = { 0 }; // 8 for asm loop plus up to 7 for overflow loop
-
- for (int y=y0; y<ymax; y+=8) {
- const __fp16 *inptr0 = inptr + y * ldin + k0;
- const __fp16 *inptr1 = inptr0 + ldin;
- const __fp16 *inptr2 = inptr1 + ldin;
- const __fp16 *inptr3 = inptr2 + ldin;
- const __fp16 *inptr4 = inptr3 + ldin;
- const __fp16 *inptr5 = inptr4 + ldin;
- const __fp16 *inptr6 = inptr5 + ldin;
- const __fp16 *inptr7 = inptr6 + ldin;
-
- prefetch_2x(inptr0);
- prefetch_2x(inptr1);
- prefetch_2x(inptr2);
- prefetch_2x(inptr3);
- prefetch_2x(inptr4);
- prefetch_2x(inptr5);
- prefetch_2x(inptr6);
- prefetch_2x(inptr7);
-
- int x=(kmax-k0);
- for (;(x>7) || first;x-=8) {
- /* Cope with ragged cases by copying from a buffer of zeroes instead */
- /* 'first' forces this to always run at least once, needed if the total size is <=7. */
- if ((y + 7) >= ymax) {
- switch ((y + 7) - ymax) {
- case 6:
- inptr1 = zerobuff;
- // fall through
- case 5:
- inptr2 = zerobuff;
- // fall through
- case 4:
- inptr3 = zerobuff;
- // fall through
- case 3:
- inptr4 = zerobuff;
- // fall through
- case 2:
- inptr5 = zerobuff;
- // fall through
- case 1:
- inptr6 = zerobuff;
- // fall through
- case 0:
- inptr7 = zerobuff;
- break;
-
- default:
- UNREACHABLE("Impossible.");
- }
- }
-
- if (first) {
- if (x<=7) {
- break;
- }
-
- first = false;
- }
-
- __asm __volatile (
- // Load up 8 elements (2 vectors) from each of 8 sources.
- "LDR q0, [%[inptr0]], #16\n"
- "LDR q2, [%[inptr1]], #16\n"
- "FCVTL2 v1.4s, v0.8h\n"
- "FCVTL v0.4s, v0.4h\n"
- "LDR q4, [%[inptr2]], #16\n" // q4=C0C1C2C3
- "FCVTL2 v3.4s, v2.8h\n"
- "FCVTL v2.4s, v2.4h\n"
- "FCVTL2 v5.4s, v4.8h\n"
- "FCVTL v4.4s, v4.4h\n"
- "ZIP1 v16.4s, v0.4s, v4.4s\n" // q16=A0C0A1C1
- ASM_PREFETCH("[%[inptr0], #128]")
- "LDR q6, [%[inptr3]], #16\n" // q6=D0D1D2D3
- "FCVTL2 v7.4s, v6.8h\n"
- "FCVTL v6.4s, v6.4h\n"
- "ZIP1 v17.4s, v2.4s, v6.4s\n" // q17=B0D0B1D1
- "LDR q8, [%[inptr4]], #16\n"
- "LDR q10, [%[inptr5]], #16\n"
- "FCVTL2 v9.4s, v8.8h\n"
- "FCVTL v8.4s, v8.4h\n"
- ASM_PREFETCH("[%[inptr1], #128]")
- "LDR q12, [%[inptr6]], #16\n"
- "FCVTL2 v11.4s, v10.8h\n"
- "FCVTL v10.4s, v10.4h\n"
- "FCVTL2 v13.4s, v12.8h\n"
- "FCVTL v12.4s, v12.4h\n"
- "ZIP1 v18.4s, v8.4s, v12.4s\n"
- "LDR q14, [%[inptr7]], #16\n"
- "FCVTL2 v15.4s, v14.8h\n"
- "FCVTL v14.4s, v14.4h\n"
- "ZIP1 v19.4s, v10.4s, v14.4s\n"
-
- ASM_PREFETCH("[%[inptr2], #128]")
- "ZIP1 v20.4s, v16.4s, v17.4s\n" // q20=A0B0C0D0
- "ZIP1 v21.4s, v18.4s, v19.4s\n"
- "ZIP2 v22.4s, v16.4s, v17.4s\n"
- "ZIP2 v23.4s, v18.4s, v19.4s\n"
- ASM_PREFETCH("[%[inptr3], #128]")
-
- "ZIP2 v16.4s, v0.4s, v4.4s\n"
- "ZIP2 v17.4s, v2.4s, v6.4s\n"
- "STP q20, q21, [%[outptr]], #32\n" // Write back the first element of each source
-
- "ZIP2 v18.4s, v8.4s, v12.4s\n"
- ASM_PREFETCH("[%[inptr4], #128]")
- "ZIP2 v19.4s, v10.4s, v14.4s\n"
- "STP q22, q23, [%[outptr]], #32\n" // Write back the second element of each source
-
- "ZIP1 v20.4s, v16.4s, v17.4s\n"
- "ZIP1 v21.4s, v18.4s, v19.4s\n"
- ASM_PREFETCH("[%[inptr5], #128]")
- "ZIP2 v22.4s, v16.4s, v17.4s\n"
- "ZIP2 v23.4s, v18.4s, v19.4s\n"
-
- "ZIP1 v16.4s, v1.4s, v5.4s\n"
- "ZIP1 v17.4s, v3.4s, v7.4s\n"
- ASM_PREFETCH("[%[inptr6], #128]")
- "STP q20, q21, [%[outptr]], #32\n" // Third element
-
- "ZIP1 v18.4s, v9.4s, v13.4s\n"
- "ZIP1 v19.4s, v11.4s, v15.4s\n"
- "STP q22, q23, [%[outptr]], #32\n" // Fourth element
- ASM_PREFETCH("[%[inptr7], #128]")
-
- "ZIP1 v20.4s, v16.4s, v17.4s\n"
- "ZIP1 v21.4s, v18.4s, v19.4s\n"
- "ZIP2 v22.4s, v16.4s, v17.4s\n"
- "ZIP2 v23.4s, v18.4s, v19.4s\n"
-
- "ZIP2 v16.4s, v1.4s, v5.4s\n"
- "ZIP2 v17.4s, v3.4s, v7.4s\n"
- "STP q20, q21, [%[outptr]], #32\n" // Fifth element
-
- "ZIP2 v18.4s, v9.4s, v13.4s\n"
- "ZIP2 v19.4s, v11.4s, v15.4s\n"
- "STP q22, q23, [%[outptr]], #32\n" // Sixth element
-
- "ZIP1 v20.4s, v16.4s, v17.4s\n"
- "ZIP1 v21.4s, v18.4s, v19.4s\n"
- "STP q20, q21, [%[outptr]], #32\n" // Seventh element
-
- "ZIP2 v22.4s, v16.4s, v17.4s\n"
- "ZIP2 v23.4s, v18.4s, v19.4s\n"
- "STP q22, q23, [%[outptr]], #32\n" // Eighth element
- : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
- [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
- "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "memory"
- );
- }
-
- for (;x>0;x--) {
- *outptr++ = *inptr0++;
- *outptr++ = *inptr1++;
- *outptr++ = *inptr2++;
- *outptr++ = *inptr3++;
- *outptr++ = *inptr4++;
- *outptr++ = *inptr5++;
- *outptr++ = *inptr6++;
- *outptr++ = *inptr7++;
- }
- }
-}
-
-#endif // __aarch64__ && __ARM_FP16_ARGS
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_s8_to_s16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_s8_to_s16.hpp
deleted file mode 100644
index 37344a82a9..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_s8_to_s16.hpp
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#if defined(__aarch64__) && defined(__ARM_FP16_ARGS)
-
-#include <arm_neon.h>
-#include <cstdint>
-
-#include "../asmlib.hpp"
-
-template<>
-template<>
-inline void TransformImpl<8, 1, false, 2, 1, false>::Transform(int16_t *out, const int8_t *in, int ldin, int y0, int ymax, int k0, int kmax) {
- int16_t *outptr = out;
- const int8_t *inptr = in;
- bool first = true;
-
- int8_t zerobuff[32] = { 0 }; // 16 for asm loop plus up to 15 for overflow loop
-
- for (int y=y0; y<ymax; y+=8) {
- const int8_t *inptr0 = inptr + y * ldin + k0;
- const int8_t *inptr1 = inptr0 + ldin;
- const int8_t *inptr2 = inptr1 + ldin;
- const int8_t *inptr3 = inptr2 + ldin;
- const int8_t *inptr4 = inptr3 + ldin;
- const int8_t *inptr5 = inptr4 + ldin;
- const int8_t *inptr6 = inptr5 + ldin;
- const int8_t *inptr7 = inptr6 + ldin;
-
- prefetch_2x(inptr0);
- prefetch_2x(inptr1);
- prefetch_2x(inptr2);
- prefetch_2x(inptr3);
- prefetch_2x(inptr4);
- prefetch_2x(inptr5);
- prefetch_2x(inptr6);
- prefetch_2x(inptr7);
-
- int x=(kmax-k0);
- for (;(x>15) || first;x-=16) {
- /* Cope with ragged cases by copying from a buffer of zeroes instead */
- /* 'first' forces this to always run at least once, needed if the total size is <=7. */
- if ((y + 7) >= ymax) {
- switch ((y + 7) - ymax) {
- case 6:
- inptr1 = zerobuff;
- // fall through
- case 5:
- inptr2 = zerobuff;
- // fall through
- case 4:
- inptr3 = zerobuff;
- // fall through
- case 3:
- inptr4 = zerobuff;
- // fall through
- case 2:
- inptr5 = zerobuff;
- // fall through
- case 1:
- inptr6 = zerobuff;
- // fall through
- case 0:
- inptr7 = zerobuff;
- break;
-
- default:
- UNREACHABLE("Impossible.");
- }
- }
-
- if (first) {
- if (x<=15) {
- break;
- }
-
- first = false;
- }
-
- __asm __volatile (
- // Load up 16 elements (1 source vector, 2 destination vectors) from each of 8 sources.
- "LDR q0, [%[inptr0]], #16\n"
- "LDR q2, [%[inptr1]], #16\n"
- "SSHLL2 v1.8h, v0.16b, #0\n"
- "SSHLL v0.8h, v0.8b, #0\n"
- "LDR q4, [%[inptr2]], #16\n" // q4=C0C1C2C3
- "SSHLL2 v3.8h, v2.16b, #0\n"
- "SSHLL v2.8h, v2.8b, #0\n"
- "SSHLL2 v5.8h, v4.16b, #0\n"
- "SSHLL v4.8h, v4.8b, #0\n"
- "ZIP1 v16.8h, v0.8h, v4.8h\n" // q16=A0C0A1C1
- ASM_PREFETCH("[%[inptr0], #128]")
- "LDR q6, [%[inptr3]], #16\n" // q6=D0D1D2D3
- "SSHLL2 v7.8h, v6.16b, #0\n"
- "SSHLL v6.8h, v6.8b, #0\n"
- "ZIP1 v17.8h, v2.8h, v6.8h\n" // q17=B0D0B1D1
- "LDR q8, [%[inptr4]], #16\n"
- "LDR q10, [%[inptr5]], #16\n"
- "SSHLL2 v9.8h, v8.16b, #0\n"
- "SSHLL v8.8h, v8.8b, #0\n"
- ASM_PREFETCH("[%[inptr1], #128]")
- "LDR q12, [%[inptr6]], #16\n"
- "SSHLL2 v11.8h, v10.16b, #0\n"
- "SSHLL v10.8h, v10.8b, #0\n"
- "SSHLL2 v13.8h, v12.16b, #0\n"
- "SSHLL v12.8h, v12.8b, #0\n"
- "ZIP1 v18.8h, v8.8h, v12.8h\n"
- "LDR q14, [%[inptr7]], #16\n"
- "SSHLL2 v15.8h, v14.16b, #0\n"
- "SSHLL v14.8h, v14.8b, #0\n"
- "ZIP1 v19.8h, v10.8h, v14.8h\n"
-
- ASM_PREFETCH("[%[inptr2], #128]")
- "ZIP1 v20.8h, v16.8h, v17.8h\n" // q20=A0B0C0D0A1B1C1D1
- "ZIP1 v21.8h, v18.8h, v19.8h\n" // q21=E0F0G0H0E1F1G1H1
- "ZIP2 v22.8h, v16.8h, v17.8h\n" // q22=A2B2C2D2A3B3C3D3
- "ZIP2 v23.8h, v18.8h, v19.8h\n" // q23=E2F2G2H1E3F3G3H3
- ASM_PREFETCH("[%[inptr3], #128]")
-
- "ZIP2 v16.8h, v0.8h, v4.8h\n"
- "ZIP2 v17.8h, v2.8h, v6.8h\n"
- "TRN1 v24.2d, v20.2d, v21.2d\n"
- "TRN2 v25.2d, v20.2d, v21.2d\n"
-
- "ZIP2 v18.8h, v8.8h, v12.8h\n"
- ASM_PREFETCH("[%[inptr4], #128]")
- "ZIP2 v19.8h, v10.8h, v14.8h\n"
- "STP q24, q25, [%[outptr]], #32\n" // Write back the first element of each source
- "TRN1 v24.2d, v22.2d, v23.2d\n"
- "TRN2 v25.2d, v22.2d, v23.2d\n"
-
- "ZIP1 v20.8h, v16.8h, v17.8h\n"
- "ZIP1 v21.8h, v18.8h, v19.8h\n"
- ASM_PREFETCH("[%[inptr5], #128]")
- "ZIP2 v22.8h, v16.8h, v17.8h\n"
- "ZIP2 v23.8h, v18.8h, v19.8h\n"
- "STP q24, q25, [%[outptr]], #32\n" // Write back the second element of each source
-
- "ZIP1 v16.8h, v1.8h, v5.8h\n"
- "ZIP1 v17.8h, v3.8h, v7.8h\n"
- ASM_PREFETCH("[%[inptr6], #128]")
- "TRN1 v24.2d, v20.2d, v21.2d\n"
- "TRN2 v25.2d, v20.2d, v21.2d\n"
-
- "ZIP1 v18.8h, v9.8h, v13.8h\n"
- "ZIP1 v19.8h, v11.8h, v15.8h\n"
- "STP q24, q25, [%[outptr]], #32\n" // Third element
- "TRN1 v24.2d, v22.2d, v23.2d\n"
- "TRN2 v25.2d, v22.2d, v23.2d\n"
- ASM_PREFETCH("[%[inptr7], #128]")
-
- "ZIP1 v20.8h, v16.8h, v17.8h\n"
- "ZIP1 v21.8h, v18.8h, v19.8h\n"
- "STP q24, q25, [%[outptr]], #32\n" // Fourth element
- "ZIP2 v22.8h, v16.8h, v17.8h\n"
- "ZIP2 v23.8h, v18.8h, v19.8h\n"
-
- "ZIP2 v16.8h, v1.8h, v5.8h\n"
- "ZIP2 v17.8h, v3.8h, v7.8h\n"
- "TRN1 v24.2d, v20.2d, v21.2d\n"
- "TRN2 v25.2d, v20.2d, v21.2d\n"
-
- "ZIP2 v18.8h, v9.8h, v13.8h\n"
- "ZIP2 v19.8h, v11.8h, v15.8h\n"
- "STP q24, q25, [%[outptr]], #32\n" // Fifth element
- "TRN1 v24.2d, v22.2d, v23.2d\n"
- "TRN2 v25.2d, v22.2d, v23.2d\n"
-
- "ZIP1 v20.8h, v16.8h, v17.8h\n"
- "ZIP1 v21.8h, v18.8h, v19.8h\n"
- "STP q24, q25, [%[outptr]], #32\n" // Sixth element
- "TRN1 v24.2d, v20.2d, v21.2d\n"
- "TRN2 v25.2d, v20.2d, v21.2d\n"
-
- "ZIP2 v22.8h, v16.8h, v17.8h\n"
- "ZIP2 v23.8h, v18.8h, v19.8h\n"
- "STP q24, q25, [%[outptr]], #32\n" // Seventh element
- "TRN1 v24.2d, v22.2d, v23.2d\n"
- "TRN2 v25.2d, v22.2d, v23.2d\n"
- "STP q24, q25, [%[outptr]], #32\n" // Eighth element
- : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
- [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
- "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "memory"
- );
- }
-
- for (;x>0;x--) {
- *outptr++ = *inptr0++;
- *outptr++ = *inptr1++;
- *outptr++ = *inptr2++;
- *outptr++ = *inptr3++;
- *outptr++ = *inptr4++;
- *outptr++ = *inptr5++;
- *outptr++ = *inptr6++;
- *outptr++ = *inptr7++;
- }
- }
-}
-
-#endif // __aarch64__ && __ARM_FP16_ARGS
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_u8_to_u16.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_u8_to_u16.hpp
deleted file mode 100644
index a3a269c9cd..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_interleave_8way_u8_to_u16.hpp
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#if defined(__aarch64__) && defined(__ARM_FP16_ARGS)
-
-#include <arm_neon.h>
-#include <cstdint>
-
-#include "../asmlib.hpp"
-
-template<>
-template<>
-inline void TransformImpl<8, 1, false, 2, 1, false>::Transform(uint16_t *out, const uint8_t *in, int ldin, int y0, int ymax, int k0, int kmax) {
- uint16_t *outptr = out;
- const uint8_t *inptr = in;
- bool first = true;
-
- uint8_t zerobuff[32] = { 0 }; // 16 for asm loop plus up to 15 for overflow loop
-
- for (int y=y0; y<ymax; y+=8) {
- const uint8_t *inptr0 = inptr + y * ldin + k0;
- const uint8_t *inptr1 = inptr0 + ldin;
- const uint8_t *inptr2 = inptr1 + ldin;
- const uint8_t *inptr3 = inptr2 + ldin;
- const uint8_t *inptr4 = inptr3 + ldin;
- const uint8_t *inptr5 = inptr4 + ldin;
- const uint8_t *inptr6 = inptr5 + ldin;
- const uint8_t *inptr7 = inptr6 + ldin;
-
- prefetch_2x(inptr0);
- prefetch_2x(inptr1);
- prefetch_2x(inptr2);
- prefetch_2x(inptr3);
- prefetch_2x(inptr4);
- prefetch_2x(inptr5);
- prefetch_2x(inptr6);
- prefetch_2x(inptr7);
-
- int x=(kmax-k0);
- for (;(x>15) || first;x-=16) {
- /* Cope with ragged cases by copying from a buffer of zeroes instead */
- /* 'first' forces this to always run at least once, needed if the total size is <=7. */
- if ((y + 7) >= ymax) {
- switch ((y + 7) - ymax) {
- case 6:
- inptr1 = zerobuff;
- // fall through
- case 5:
- inptr2 = zerobuff;
- // fall through
- case 4:
- inptr3 = zerobuff;
- // fall through
- case 3:
- inptr4 = zerobuff;
- // fall through
- case 2:
- inptr5 = zerobuff;
- // fall through
- case 1:
- inptr6 = zerobuff;
- // fall through
- case 0:
- inptr7 = zerobuff;
- break;
-
- default:
- UNREACHABLE("Impossible.");
- }
- }
-
- if (first) {
- if (x<=15) {
- break;
- }
-
- first = false;
- }
-
- __asm __volatile (
- // Load up 16 elements (1 source vector, 2 destination vectors) from each of 8 sources.
- "LDR q0, [%[inptr0]], #16\n"
- "LDR q2, [%[inptr1]], #16\n"
- "USHLL2 v1.8h, v0.16b, #0\n"
- "USHLL v0.8h, v0.8b, #0\n"
- "LDR q4, [%[inptr2]], #16\n" // q4=C0C1C2C3
- "USHLL2 v3.8h, v2.16b, #0\n"
- "USHLL v2.8h, v2.8b, #0\n"
- "USHLL2 v5.8h, v4.16b, #0\n"
- "USHLL v4.8h, v4.8b, #0\n"
- "ZIP1 v16.8h, v0.8h, v4.8h\n" // q16=A0C0A1C1
- ASM_PREFETCH("[%[inptr0], #128]")
- "LDR q6, [%[inptr3]], #16\n" // q6=D0D1D2D3
- "USHLL2 v7.8h, v6.16b, #0\n"
- "USHLL v6.8h, v6.8b, #0\n"
- "ZIP1 v17.8h, v2.8h, v6.8h\n" // q17=B0D0B1D1
- "LDR q8, [%[inptr4]], #16\n"
- "LDR q10, [%[inptr5]], #16\n"
- "USHLL2 v9.8h, v8.16b, #0\n"
- "USHLL v8.8h, v8.8b, #0\n"
- ASM_PREFETCH("[%[inptr1], #128]")
- "LDR q12, [%[inptr6]], #16\n"
- "USHLL2 v11.8h, v10.16b, #0\n"
- "USHLL v10.8h, v10.8b, #0\n"
- "USHLL2 v13.8h, v12.16b, #0\n"
- "USHLL v12.8h, v12.8b, #0\n"
- "ZIP1 v18.8h, v8.8h, v12.8h\n"
- "LDR q14, [%[inptr7]], #16\n"
- "USHLL2 v15.8h, v14.16b, #0\n"
- "USHLL v14.8h, v14.8b, #0\n"
- "ZIP1 v19.8h, v10.8h, v14.8h\n"
-
- ASM_PREFETCH("[%[inptr2], #128]")
- "ZIP1 v20.8h, v16.8h, v17.8h\n" // q20=A0B0C0D0A1B1C1D1
- "ZIP1 v21.8h, v18.8h, v19.8h\n" // q21=E0F0G0H0E1F1G1H1
- "ZIP2 v22.8h, v16.8h, v17.8h\n" // q22=A2B2C2D2A3B3C3D3
- "ZIP2 v23.8h, v18.8h, v19.8h\n" // q23=E2F2G2H1E3F3G3H3
- ASM_PREFETCH("[%[inptr3], #128]")
-
- "ZIP2 v16.8h, v0.8h, v4.8h\n"
- "ZIP2 v17.8h, v2.8h, v6.8h\n"
- "TRN1 v24.2d, v20.2d, v21.2d\n"
- "TRN2 v25.2d, v20.2d, v21.2d\n"
-
- "ZIP2 v18.8h, v8.8h, v12.8h\n"
- ASM_PREFETCH("[%[inptr4], #128]")
- "ZIP2 v19.8h, v10.8h, v14.8h\n"
- "STP q24, q25, [%[outptr]], #32\n" // Write back the first element of each source
- "TRN1 v24.2d, v22.2d, v23.2d\n"
- "TRN2 v25.2d, v22.2d, v23.2d\n"
-
- "ZIP1 v20.8h, v16.8h, v17.8h\n"
- "ZIP1 v21.8h, v18.8h, v19.8h\n"
- ASM_PREFETCH("[%[inptr5], #128]")
- "ZIP2 v22.8h, v16.8h, v17.8h\n"
- "ZIP2 v23.8h, v18.8h, v19.8h\n"
- "STP q24, q25, [%[outptr]], #32\n" // Write back the second element of each source
-
- "ZIP1 v16.8h, v1.8h, v5.8h\n"
- "ZIP1 v17.8h, v3.8h, v7.8h\n"
- ASM_PREFETCH("[%[inptr6], #128]")
- "TRN1 v24.2d, v20.2d, v21.2d\n"
- "TRN2 v25.2d, v20.2d, v21.2d\n"
-
- "ZIP1 v18.8h, v9.8h, v13.8h\n"
- "ZIP1 v19.8h, v11.8h, v15.8h\n"
- "STP q24, q25, [%[outptr]], #32\n" // Third element
- "TRN1 v24.2d, v22.2d, v23.2d\n"
- "TRN2 v25.2d, v22.2d, v23.2d\n"
- ASM_PREFETCH("[%[inptr7], #128]")
-
- "ZIP1 v20.8h, v16.8h, v17.8h\n"
- "ZIP1 v21.8h, v18.8h, v19.8h\n"
- "STP q24, q25, [%[outptr]], #32\n" // Fourth element
- "ZIP2 v22.8h, v16.8h, v17.8h\n"
- "ZIP2 v23.8h, v18.8h, v19.8h\n"
-
- "ZIP2 v16.8h, v1.8h, v5.8h\n"
- "ZIP2 v17.8h, v3.8h, v7.8h\n"
- "TRN1 v24.2d, v20.2d, v21.2d\n"
- "TRN2 v25.2d, v20.2d, v21.2d\n"
-
- "ZIP2 v18.8h, v9.8h, v13.8h\n"
- "ZIP2 v19.8h, v11.8h, v15.8h\n"
- "STP q24, q25, [%[outptr]], #32\n" // Fifth element
- "TRN1 v24.2d, v22.2d, v23.2d\n"
- "TRN2 v25.2d, v22.2d, v23.2d\n"
-
- "ZIP1 v20.8h, v16.8h, v17.8h\n"
- "ZIP1 v21.8h, v18.8h, v19.8h\n"
- "STP q24, q25, [%[outptr]], #32\n" // Sixth element
- "TRN1 v24.2d, v20.2d, v21.2d\n"
- "TRN2 v25.2d, v20.2d, v21.2d\n"
-
- "ZIP2 v22.8h, v16.8h, v17.8h\n"
- "ZIP2 v23.8h, v18.8h, v19.8h\n"
- "STP q24, q25, [%[outptr]], #32\n" // Seventh element
- "TRN1 v24.2d, v22.2d, v23.2d\n"
- "TRN2 v25.2d, v22.2d, v23.2d\n"
- "STP q24, q25, [%[outptr]], #32\n" // Eighth element
- : [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3),
- [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7), [outptr] "+r" (outptr)
- :
- : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12",
- "v13", "v14", "v15", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "memory"
- );
- }
-
- for (;x>0;x--) {
- *outptr++ = *inptr0++;
- *outptr++ = *inptr1++;
- *outptr++ = *inptr2++;
- *outptr++ = *inptr3++;
- *outptr++ = *inptr4++;
- *outptr++ = *inptr5++;
- *outptr++ = *inptr6++;
- *outptr++ = *inptr7++;
- }
- }
-}
-
-#endif // __aarch64__ && __ARM_FP16_ARGS
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
index 5ab5774751..f6233ef503 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_16bit.hpp
@@ -30,12 +30,12 @@
// Generic unblocked transposed 6x32-bit sized specialisation
template <>
template <typename T>
-inline void TransformImpl<6, 1, true, 4, 4, false>::Transform(
+inline void TransformImpl<6, 1, true, 4, 4, VLType::None>::Transform(
T* out, const T* const in, const int stride,
const int x0, const int xmax, const int k0, const int kmax
) {
// Redirect to a 12 x uint16_t specialisation
- TransformImpl<12, 1, true, 2, 2, false>::Transform(
+ TransformImpl<12, 1, true, 2, 2, VLType::None>::Transform(
reinterpret_cast<uint16_t *>(out),
reinterpret_cast<const uint16_t *>(in),
stride*2, x0*2, xmax*2, k0, kmax
@@ -45,7 +45,7 @@ inline void TransformImpl<6, 1, true, 4, 4, false>::Transform(
// Generic 12x16-bit sized specialisation
template <>
template <typename T>
-inline void TransformImpl<12, 1, true, 2, 2, false>::Transform(
+inline void TransformImpl<12, 1, true, 2, 2, VLType::None>::Transform(
T* out, const T* const in, const int stride,
const int x0, const int xmax, const int k0, const int kmax
) {
@@ -135,7 +135,7 @@ inline void TransposeInterleaveCommon<12, uint16_t, uint16_t>::moveblock_1x4(con
template <>
template <>
-inline void TransformImpl<12, 1, true, 2, 2, false>::Transform(
+inline void TransformImpl<12, 1, true, 2, 2, VLType::None>::Transform(
uint16_t* out, const uint16_t* const in, const int stride,
const int x0, const int xmax, const int k0, const int kmax
) {
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
index d7de9ff934..c0f3e17d31 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_12way_half_to_float.hpp
@@ -110,7 +110,7 @@ inline void TransposeInterleaveCommon<12, __fp16, float>::moveblock_1x4(const __
template <>
template <>
-inline void TransformImpl<12, 1, true, 4, 2, false>::Transform(
+inline void TransformImpl<12, 1, true, 4, 2, VLType::None>::Transform(
float* out, const __fp16* const in, const int stride,
const int x0, const int xmax, const int k0, const int kmax
) {
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
index a137f9360a..bcbe2b84d8 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_24way_16bit.hpp
@@ -30,12 +30,12 @@
// Generic unblocked transposed 12x32-bit sized specialisation
template <>
template <typename T>
-inline void TransformImpl<12, 1, true, 4, 4, false>::Transform(
+inline void TransformImpl<12, 1, true, 4, 4, VLType::None>::Transform(
T* out, const T* const in, const int stride,
const int x0, const int xmax, const int k0, const int kmax
) {
// Redirect to a 24 x uint16_t specialisation
- TransformImpl<24, 1, true, 2, 2, false>::Transform(
+ TransformImpl<24, 1, true, 2, 2, VLType::None>::Transform(
reinterpret_cast<uint16_t *>(out),
reinterpret_cast<const uint16_t * const>(in),
stride*2, x0*2, xmax*2, k0, kmax
@@ -45,7 +45,7 @@ inline void TransformImpl<12, 1, true, 4, 4, false>::Transform(
// Generic 24x16-bit sized specialisation
template <>
template <typename T>
-inline void TransformImpl<24, 1, true, 2, 2, false>::Transform(
+inline void TransformImpl<24, 1, true, 2, 2, VLType::None>::Transform(
T* out, const T* const in, const int stride,
const int x0, const int xmax, const int k0, const int kmax
) {
@@ -120,7 +120,7 @@ inline void TransposeInterleaveCommon<24, uint16_t, uint16_t>::moveblock_1x4(con
template <>
template <>
-inline void TransformImpl<24, 1, true, 2, 2, false>::Transform(
+inline void TransformImpl<24, 1, true, 2, 2, VLType::None>::Transform(
uint16_t* out, const uint16_t* const in, const int stride,
const int x0, const int xmax, const int k0, const int kmax
) {
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp
index 974be481e7..df68740bb4 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/a64_transpose_interleave_8way_32bit.hpp
@@ -30,12 +30,12 @@
// Generic unblocked transposed 8x32-bit sized specialisation
template <>
template <typename T>
-inline void TransformImpl<8, 1, true, 4, 4, false>::Transform(
+inline void TransformImpl<8, 1, true, 4, 4, VLType::None>::Transform(
T* out, const T* const in, const int stride,
const int x0, const int xmax, const int k0, const int kmax
) {
// Redirect to a 16 x uint16_t specialisation
- TransformImpl<16, 1, true, 2, 2, false>::Transform(
+ TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
reinterpret_cast<uint16_t *>(out),
reinterpret_cast<const uint16_t *>(in),
stride*2, x0*2, xmax*2, k0, kmax
@@ -45,7 +45,7 @@ inline void TransformImpl<8, 1, true, 4, 4, false>::Transform(
// Generic 16x16-bit sized specialisation
template <>
template <typename T>
-inline void TransformImpl<16, 1, true, 2, 2, false>::Transform(
+inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
T* out, const T* const in, const int stride,
const int x0, const int xmax, const int k0, const int kmax
) {
@@ -137,7 +137,7 @@ inline void TransposeInterleaveCommon<16, uint16_t, uint16_t>::moveblock_1x4(con
template <>
template <>
-inline void TransformImpl<16, 1, true, 2, 2, false>::Transform(
+inline void TransformImpl<16, 1, true, 2, 2, VLType::None>::Transform(
uint16_t* out, const uint16_t* const in, const int stride,
const int x0, const int xmax, const int k0, const int kmax
) {
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
index b825e1c358..e092c729ba 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/list.hpp
@@ -21,22 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#include "a32_interleave_6way_32bit.hpp"
#include "a32_transpose_interleave_8way_32bit.hpp"
-#include "a64_block16_interleave4_8bit.hpp"
-#include "a64_interleave_8way_16bit.hpp"
-#include "a64_interleave_8way_32bit.hpp"
-#include "a64_interleave_8way_block4_8bit.hpp"
-#include "a64_interleave_8way_half_to_float.hpp"
-#include "a64_interleave_8way_s8_to_s16.hpp"
-#include "a64_interleave_8way_u8_to_u16.hpp"
#include "a64_transpose_interleave_12way_16bit.hpp"
#include "a64_transpose_interleave_12way_half_to_float.hpp"
#include "a64_transpose_interleave_24way_16bit.hpp"
#include "a64_transpose_interleave_8way_32bit.hpp"
-#include "sve_interleave_8way_32bit.hpp"
-#include "sve_interleave_8way_block2_16bit.hpp"
-#include "sve_interleave_8way_block2_32bit.hpp"
-#include "sve_interleave_8way_block4_16bit.hpp"
-#include "sve_interleave_8way_block4_8bit.hpp"
-#include "sve_interleave_8way_block8_8bit.hpp"
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp
deleted file mode 100644
index 348d78e3f5..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_32bit.hpp
+++ /dev/null
@@ -1,596 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-template<>
-template<typename T>
-inline void TransformImpl<8, 1, false, 4, 4, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
-{
- uint32_t *master_outptr = reinterpret_cast<uint32_t *>(out);
- const uint32_t *inptr = reinterpret_cast<const uint32_t *>(in);
-
- for (int y=y0; y<ymax; y+=8)
- {
- const int height = ymax-y;
- const long inwidth = (kmax - k0);
- const long outwidth = inwidth * 8;
- long inpos = 0;
- long outpos = 0;
-
- uint32_t *outptr = master_outptr;
- master_outptr += outwidth;
-
- const uint32_t *inptr0 = inptr + y * ldin + k0;
- const uint32_t *inptr1 = inptr0 + ldin;
- const uint32_t *inptr2 = inptr1 + ldin;
- const uint32_t *inptr3 = inptr2 + ldin;
- const uint32_t *inptr4 = inptr3 + ldin;
- const uint32_t *inptr5 = inptr4 + ldin;
- const uint32_t *inptr6 = inptr5 + ldin;
- const uint32_t *inptr7 = inptr6 + ldin;
-
- switch(height)
- {
- case 1:
- __asm __volatile(
- "1:\n"
- "whilelt p0.s, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z4.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "incw %[inpos], all, mul #1\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
- "zip1 z0.s, z8.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z1.s, z8.s, z4.s\n"
- "zip1 z2.s, z9.s, z4.s\n"
- "zip2 z3.s, z9.s, z4.s\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "zip1 z10.s, z1.s, z4.s\n"
- "st1w z8.s, p0, [%[outptr]]\n"
- "zip2 z11.s, z1.s, z4.s\n"
- "whilelt p3.s, %[outpos], %[outwidth]\n"
- "zip1 z12.s, z2.s, z4.s\n"
- "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
- "zip2 z13.s, z2.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z4.s\n"
- "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
- "zip2 z15.s, z3.s, z4.s\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
- "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 2:
- __asm __volatile(
- "1:\n"
- "whilelt p0.s, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z4.s, #0\n"
- "mov z14.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "incw %[inpos], all, mul #1\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "zip1 z10.s, z1.s, z4.s\n"
- "zip2 z11.s, z1.s, z4.s\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
- "zip1 z0.s, z8.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z1.s, z8.s, z4.s\n"
- "zip1 z2.s, z9.s, z4.s\n"
- "zip2 z3.s, z9.s, z4.s\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
- "zip1 z4.s, z10.s, z14.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z5.s, z10.s, z14.s\n"
- "zip1 z6.s, z11.s, z14.s\n"
- "zip2 z7.s, z11.s, z14.s\n"
- "whilelt p3.s, %[outpos], %[outwidth]\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "st1w z8.s, p0, [%[outptr]]\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
- "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 3:
- __asm __volatile(
- "1:\n"
- "whilelt p0.s, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z4.s, #0\n"
- "mov z14.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
- "incw %[inpos], all, mul #1\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z4.s\n"
- "zip2 z11.s, z1.s, z4.s\n"
- "zip1 z12.s, z2.s, z4.s\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z0.s, z8.s, z12.s\n"
- "zip2 z1.s, z8.s, z12.s\n"
- "zip1 z2.s, z9.s, z13.s\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
- "zip2 z3.s, z9.s, z13.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z4.s, z10.s, z14.s\n"
- "zip2 z5.s, z10.s, z14.s\n"
- "zip1 z6.s, z11.s, z14.s\n"
- "whilelt p3.s, %[outpos], %[outwidth]\n"
- "zip2 z7.s, z11.s, z14.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "st1w z8.s, p0, [%[outptr]]\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
- "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 4:
- __asm __volatile(
- "1:\n"
- "whilelt p0.s, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z4.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
- "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
- "incw %[inpos], all, mul #1\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z4.s\n"
- "zip2 z11.s, z1.s, z4.s\n"
- "zip1 z12.s, z2.s, z4.s\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z4.s\n"
- "zip2 z15.s, z3.s, z4.s\n"
- "zip1 z0.s, z8.s, z12.s\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
- "zip2 z1.s, z8.s, z12.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z2.s, z9.s, z13.s\n"
- "zip2 z3.s, z9.s, z13.s\n"
- "zip1 z4.s, z10.s, z14.s\n"
- "whilelt p3.s, %[outpos], %[outwidth]\n"
- "zip2 z5.s, z10.s, z14.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z6.s, z11.s, z15.s\n"
- "zip2 z7.s, z11.s, z15.s\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "st1w z8.s, p0, [%[outptr]]\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
- "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 5:
- __asm __volatile(
- "1:\n"
- "whilelt p0.s, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z5.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
- "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
- "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
- "incw %[inpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z5.s\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z5.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z5.s\n"
- "zip2 z15.s, z3.s, z5.s\n"
- "zip1 z0.s, z8.s, z12.s\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
- "zip2 z1.s, z8.s, z12.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z2.s, z9.s, z13.s\n"
- "zip2 z3.s, z9.s, z13.s\n"
- "zip1 z4.s, z10.s, z14.s\n"
- "whilelt p3.s, %[outpos], %[outwidth]\n"
- "zip2 z5.s, z10.s, z14.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z6.s, z11.s, z15.s\n"
- "zip2 z7.s, z11.s, z15.s\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "st1w z8.s, p0, [%[outptr]]\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
- "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 6:
- __asm __volatile(
- "1:\n"
- "whilelt p0.s, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z6.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
- "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
- "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
- "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
- "incw %[inpos], all, mul #1\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z6.s\n"
- "zip2 z15.s, z3.s, z6.s\n"
- "zip1 z0.s, z8.s, z12.s\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
- "zip2 z1.s, z8.s, z12.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z2.s, z9.s, z13.s\n"
- "zip2 z3.s, z9.s, z13.s\n"
- "zip1 z4.s, z10.s, z14.s\n"
- "whilelt p3.s, %[outpos], %[outwidth]\n"
- "zip2 z5.s, z10.s, z14.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z6.s, z11.s, z15.s\n"
- "zip2 z7.s, z11.s, z15.s\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "st1w z8.s, p0, [%[outptr]]\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
- "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 7:
- __asm __volatile(
- "1:\n"
- "whilelt p0.s, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z7.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
- "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
- "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
- "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
- "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
- "incw %[inpos], all, mul #1\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "zip1 z0.s, z8.s, z12.s\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
- "zip2 z1.s, z8.s, z12.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z2.s, z9.s, z13.s\n"
- "zip2 z3.s, z9.s, z13.s\n"
- "zip1 z4.s, z10.s, z14.s\n"
- "whilelt p3.s, %[outpos], %[outwidth]\n"
- "zip2 z5.s, z10.s, z14.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z6.s, z11.s, z15.s\n"
- "zip2 z7.s, z11.s, z15.s\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "st1w z8.s, p0, [%[outptr]]\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
- "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- default:
- case 8:
- __asm __volatile(
- "1:\n"
- "whilelt p0.s, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
- "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
- "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
- "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
- "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
- "ld1w z7.s, p0/z, [%[inptr7], %[inpos], LSL #2]\n"
- "incw %[inpos], all, mul #1\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "zip1 z0.s, z8.s, z12.s\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
- "zip2 z1.s, z8.s, z12.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z2.s, z9.s, z13.s\n"
- "zip2 z3.s, z9.s, z13.s\n"
- "zip1 z4.s, z10.s, z14.s\n"
- "whilelt p3.s, %[outpos], %[outwidth]\n"
- "zip2 z5.s, z10.s, z14.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z6.s, z11.s, z15.s\n"
- "zip2 z7.s, z11.s, z15.s\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "st1w z8.s, p0, [%[outptr]]\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
- "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
-
- }
- }
-}
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_16bit.hpp
deleted file mode 100644
index 234433a0f1..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_16bit.hpp
+++ /dev/null
@@ -1,596 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-template<>
-template<typename T>
-inline void TransformImpl<8, 2, false, 2, 2, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
-{
- uint16_t *master_outptr = reinterpret_cast<uint16_t *>(out);
- const uint16_t *inptr = reinterpret_cast<const uint16_t *>(in);
-
- for (int y=y0; y<ymax; y+=8)
- {
- const int height = ymax-y;
- const long inwidth = (kmax - k0);
- const long outwidth = ((inwidth + 1) / 2) * 16;
- long inpos = 0;
- long outpos = 0;
-
- uint16_t *outptr = master_outptr;
- master_outptr += outwidth;
-
- const uint16_t *inptr0 = inptr + y * ldin + k0;
- const uint16_t *inptr1 = inptr0 + ldin;
- const uint16_t *inptr2 = inptr1 + ldin;
- const uint16_t *inptr3 = inptr2 + ldin;
- const uint16_t *inptr4 = inptr3 + ldin;
- const uint16_t *inptr5 = inptr4 + ldin;
- const uint16_t *inptr6 = inptr5 + ldin;
- const uint16_t *inptr7 = inptr6 + ldin;
-
- switch(height)
- {
- case 1:
- __asm __volatile(
- "1:\n"
- "whilelt p0.h, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z4.h, #0\n"
- "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
- "inch %[inpos], all, mul #1\n"
- "whilelt p0.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "whilelt p1.h, %[outpos], %[outwidth]\n"
- "zip1 z0.s, z8.s, z4.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z1.s, z8.s, z4.s\n"
- "zip1 z2.s, z9.s, z4.s\n"
- "zip2 z3.s, z9.s, z4.s\n"
- "whilelt p2.h, %[outpos], %[outwidth]\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "zip1 z10.s, z1.s, z4.s\n"
- "st1h z8.h, p0, [%[outptr]]\n"
- "zip2 z11.s, z1.s, z4.s\n"
- "whilelt p3.h, %[outpos], %[outwidth]\n"
- "zip1 z12.s, z2.s, z4.s\n"
- "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
- "zip2 z13.s, z2.s, z4.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z4.s\n"
- "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
- "zip2 z15.s, z3.s, z4.s\n"
- "whilelt p4.h, %[outpos], %[outwidth]\n"
- "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p5.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
- "whilelt p6.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
- "whilelt p7.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 2:
- __asm __volatile(
- "1:\n"
- "whilelt p0.h, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z4.h, #0\n"
- "mov z14.h, #0\n"
- "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
- "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
- "inch %[inpos], all, mul #1\n"
- "whilelt p0.h, %[outpos], %[outwidth]\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "zip1 z10.s, z1.s, z4.s\n"
- "zip2 z11.s, z1.s, z4.s\n"
- "whilelt p1.h, %[outpos], %[outwidth]\n"
- "zip1 z0.s, z8.s, z4.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z1.s, z8.s, z4.s\n"
- "zip1 z2.s, z9.s, z4.s\n"
- "zip2 z3.s, z9.s, z4.s\n"
- "whilelt p2.h, %[outpos], %[outwidth]\n"
- "zip1 z4.s, z10.s, z14.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z5.s, z10.s, z14.s\n"
- "zip1 z6.s, z11.s, z14.s\n"
- "zip2 z7.s, z11.s, z14.s\n"
- "whilelt p3.h, %[outpos], %[outwidth]\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "st1h z8.h, p0, [%[outptr]]\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "whilelt p4.h, %[outpos], %[outwidth]\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "whilelt p5.h, %[outpos], %[outwidth]\n"
- "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p6.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
- "whilelt p7.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
- "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 3:
- __asm __volatile(
- "1:\n"
- "whilelt p0.h, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z4.h, #0\n"
- "mov z14.h, #0\n"
- "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
- "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
- "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
- "inch %[inpos], all, mul #1\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p0.h, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z4.s\n"
- "zip2 z11.s, z1.s, z4.s\n"
- "zip1 z12.s, z2.s, z4.s\n"
- "whilelt p1.h, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z4.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z0.s, z8.s, z12.s\n"
- "zip2 z1.s, z8.s, z12.s\n"
- "zip1 z2.s, z9.s, z13.s\n"
- "whilelt p2.h, %[outpos], %[outwidth]\n"
- "zip2 z3.s, z9.s, z13.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z4.s, z10.s, z14.s\n"
- "zip2 z5.s, z10.s, z14.s\n"
- "zip1 z6.s, z11.s, z14.s\n"
- "whilelt p3.h, %[outpos], %[outwidth]\n"
- "zip2 z7.s, z11.s, z14.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "whilelt p4.h, %[outpos], %[outwidth]\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "st1h z8.h, p0, [%[outptr]]\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "whilelt p5.h, %[outpos], %[outwidth]\n"
- "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
- "whilelt p6.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
- "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
- "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 4:
- __asm __volatile(
- "1:\n"
- "whilelt p0.h, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z4.h, #0\n"
- "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
- "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
- "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
- "ld1h z3.h, p0/z, [%[inptr3], %[inpos], LSL #1]\n"
- "inch %[inpos], all, mul #1\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p0.h, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z4.s\n"
- "zip2 z11.s, z1.s, z4.s\n"
- "zip1 z12.s, z2.s, z4.s\n"
- "whilelt p1.h, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z4.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z4.s\n"
- "zip2 z15.s, z3.s, z4.s\n"
- "zip1 z0.s, z8.s, z12.s\n"
- "whilelt p2.h, %[outpos], %[outwidth]\n"
- "zip2 z1.s, z8.s, z12.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z2.s, z9.s, z13.s\n"
- "zip2 z3.s, z9.s, z13.s\n"
- "zip1 z4.s, z10.s, z14.s\n"
- "whilelt p3.h, %[outpos], %[outwidth]\n"
- "zip2 z5.s, z10.s, z14.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z6.s, z11.s, z15.s\n"
- "zip2 z7.s, z11.s, z15.s\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p4.h, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "st1h z8.h, p0, [%[outptr]]\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p5.h, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.h, %[outpos], %[outwidth]\n"
- "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
- "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
- "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 5:
- __asm __volatile(
- "1:\n"
- "whilelt p0.h, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z5.h, #0\n"
- "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
- "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
- "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
- "ld1h z3.h, p0/z, [%[inptr3], %[inpos], LSL #1]\n"
- "ld1h z4.h, p0/z, [%[inptr4], %[inpos], LSL #1]\n"
- "inch %[inpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "whilelt p0.h, %[outpos], %[outwidth]\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z5.s\n"
- "whilelt p1.h, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z5.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z5.s\n"
- "zip2 z15.s, z3.s, z5.s\n"
- "zip1 z0.s, z8.s, z12.s\n"
- "whilelt p2.h, %[outpos], %[outwidth]\n"
- "zip2 z1.s, z8.s, z12.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z2.s, z9.s, z13.s\n"
- "zip2 z3.s, z9.s, z13.s\n"
- "zip1 z4.s, z10.s, z14.s\n"
- "whilelt p3.h, %[outpos], %[outwidth]\n"
- "zip2 z5.s, z10.s, z14.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z6.s, z11.s, z15.s\n"
- "zip2 z7.s, z11.s, z15.s\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p4.h, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "st1h z8.h, p0, [%[outptr]]\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p5.h, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.h, %[outpos], %[outwidth]\n"
- "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
- "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
- "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 6:
- __asm __volatile(
- "1:\n"
- "whilelt p0.h, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z6.h, #0\n"
- "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
- "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
- "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
- "ld1h z3.h, p0/z, [%[inptr3], %[inpos], LSL #1]\n"
- "ld1h z4.h, p0/z, [%[inptr4], %[inpos], LSL #1]\n"
- "ld1h z5.h, p0/z, [%[inptr5], %[inpos], LSL #1]\n"
- "inch %[inpos], all, mul #1\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p0.h, %[outpos], %[outwidth]\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "whilelt p1.h, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z6.s\n"
- "zip2 z15.s, z3.s, z6.s\n"
- "zip1 z0.s, z8.s, z12.s\n"
- "whilelt p2.h, %[outpos], %[outwidth]\n"
- "zip2 z1.s, z8.s, z12.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z2.s, z9.s, z13.s\n"
- "zip2 z3.s, z9.s, z13.s\n"
- "zip1 z4.s, z10.s, z14.s\n"
- "whilelt p3.h, %[outpos], %[outwidth]\n"
- "zip2 z5.s, z10.s, z14.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z6.s, z11.s, z15.s\n"
- "zip2 z7.s, z11.s, z15.s\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p4.h, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "st1h z8.h, p0, [%[outptr]]\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p5.h, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.h, %[outpos], %[outwidth]\n"
- "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
- "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
- "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 7:
- __asm __volatile(
- "1:\n"
- "whilelt p0.h, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z7.h, #0\n"
- "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
- "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
- "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
- "ld1h z3.h, p0/z, [%[inptr3], %[inpos], LSL #1]\n"
- "ld1h z4.h, p0/z, [%[inptr4], %[inpos], LSL #1]\n"
- "ld1h z5.h, p0/z, [%[inptr5], %[inpos], LSL #1]\n"
- "ld1h z6.h, p0/z, [%[inptr6], %[inpos], LSL #1]\n"
- "inch %[inpos], all, mul #1\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p0.h, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p1.h, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "zip1 z0.s, z8.s, z12.s\n"
- "whilelt p2.h, %[outpos], %[outwidth]\n"
- "zip2 z1.s, z8.s, z12.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z2.s, z9.s, z13.s\n"
- "zip2 z3.s, z9.s, z13.s\n"
- "zip1 z4.s, z10.s, z14.s\n"
- "whilelt p3.h, %[outpos], %[outwidth]\n"
- "zip2 z5.s, z10.s, z14.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z6.s, z11.s, z15.s\n"
- "zip2 z7.s, z11.s, z15.s\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p4.h, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "st1h z8.h, p0, [%[outptr]]\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p5.h, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.h, %[outpos], %[outwidth]\n"
- "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
- "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
- "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- default:
- case 8:
- __asm __volatile(
- "1:\n"
- "whilelt p0.h, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
- "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
- "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
- "ld1h z3.h, p0/z, [%[inptr3], %[inpos], LSL #1]\n"
- "ld1h z4.h, p0/z, [%[inptr4], %[inpos], LSL #1]\n"
- "ld1h z5.h, p0/z, [%[inptr5], %[inpos], LSL #1]\n"
- "ld1h z6.h, p0/z, [%[inptr6], %[inpos], LSL #1]\n"
- "ld1h z7.h, p0/z, [%[inptr7], %[inpos], LSL #1]\n"
- "inch %[inpos], all, mul #1\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p0.h, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p1.h, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "zip1 z0.s, z8.s, z12.s\n"
- "whilelt p2.h, %[outpos], %[outwidth]\n"
- "zip2 z1.s, z8.s, z12.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z2.s, z9.s, z13.s\n"
- "zip2 z3.s, z9.s, z13.s\n"
- "zip1 z4.s, z10.s, z14.s\n"
- "whilelt p3.h, %[outpos], %[outwidth]\n"
- "zip2 z5.s, z10.s, z14.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z6.s, z11.s, z15.s\n"
- "zip2 z7.s, z11.s, z15.s\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p4.h, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "st1h z8.h, p0, [%[outptr]]\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p5.h, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.h, %[outpos], %[outwidth]\n"
- "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
- "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
- "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
-
- }
- }
-}
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp
deleted file mode 100644
index f21933b8de..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block2_32bit.hpp
+++ /dev/null
@@ -1,596 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-template<>
-template<typename T>
-inline void TransformImpl<8, 2, false, 4, 4, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
-{
- uint32_t *master_outptr = reinterpret_cast<uint32_t *>(out);
- const uint32_t *inptr = reinterpret_cast<const uint32_t *>(in);
-
- for (int y=y0; y<ymax; y+=8)
- {
- const int height = ymax-y;
- const long inwidth = (kmax - k0);
- const long outwidth = ((inwidth + 1) / 2) * 16;
- long inpos = 0;
- long outpos = 0;
-
- uint32_t *outptr = master_outptr;
- master_outptr += outwidth;
-
- const uint32_t *inptr0 = inptr + y * ldin + k0;
- const uint32_t *inptr1 = inptr0 + ldin;
- const uint32_t *inptr2 = inptr1 + ldin;
- const uint32_t *inptr3 = inptr2 + ldin;
- const uint32_t *inptr4 = inptr3 + ldin;
- const uint32_t *inptr5 = inptr4 + ldin;
- const uint32_t *inptr6 = inptr5 + ldin;
- const uint32_t *inptr7 = inptr6 + ldin;
-
- switch(height)
- {
- case 1:
- __asm __volatile(
- "1:\n"
- "whilelt p0.s, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z4.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "incw %[inpos], all, mul #1\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
- "zip1 z0.d, z8.d, z4.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z1.d, z8.d, z4.d\n"
- "zip1 z2.d, z9.d, z4.d\n"
- "zip2 z3.d, z9.d, z4.d\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "zip1 z10.d, z1.d, z4.d\n"
- "st1w z8.s, p0, [%[outptr]]\n"
- "zip2 z11.d, z1.d, z4.d\n"
- "whilelt p3.s, %[outpos], %[outwidth]\n"
- "zip1 z12.d, z2.d, z4.d\n"
- "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
- "zip2 z13.d, z2.d, z4.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z14.d, z3.d, z4.d\n"
- "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
- "zip2 z15.d, z3.d, z4.d\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
- "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 2:
- __asm __volatile(
- "1:\n"
- "whilelt p0.s, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z4.s, #0\n"
- "mov z14.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "incw %[inpos], all, mul #1\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "zip1 z10.d, z1.d, z4.d\n"
- "zip2 z11.d, z1.d, z4.d\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
- "zip1 z0.d, z8.d, z4.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z1.d, z8.d, z4.d\n"
- "zip1 z2.d, z9.d, z4.d\n"
- "zip2 z3.d, z9.d, z4.d\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
- "zip1 z4.d, z10.d, z14.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z5.d, z10.d, z14.d\n"
- "zip1 z6.d, z11.d, z14.d\n"
- "zip2 z7.d, z11.d, z14.d\n"
- "whilelt p3.s, %[outpos], %[outwidth]\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "st1w z8.s, p0, [%[outptr]]\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
- "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 3:
- __asm __volatile(
- "1:\n"
- "whilelt p0.s, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z4.s, #0\n"
- "mov z14.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
- "incw %[inpos], all, mul #1\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z4.d\n"
- "zip2 z11.d, z1.d, z4.d\n"
- "zip1 z12.d, z2.d, z4.d\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z4.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z0.d, z8.d, z12.d\n"
- "zip2 z1.d, z8.d, z12.d\n"
- "zip1 z2.d, z9.d, z13.d\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
- "zip2 z3.d, z9.d, z13.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z4.d, z10.d, z14.d\n"
- "zip2 z5.d, z10.d, z14.d\n"
- "zip1 z6.d, z11.d, z14.d\n"
- "whilelt p3.s, %[outpos], %[outwidth]\n"
- "zip2 z7.d, z11.d, z14.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "st1w z8.s, p0, [%[outptr]]\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
- "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 4:
- __asm __volatile(
- "1:\n"
- "whilelt p0.s, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z4.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
- "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
- "incw %[inpos], all, mul #1\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z4.d\n"
- "zip2 z11.d, z1.d, z4.d\n"
- "zip1 z12.d, z2.d, z4.d\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z4.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z14.d, z3.d, z4.d\n"
- "zip2 z15.d, z3.d, z4.d\n"
- "zip1 z0.d, z8.d, z12.d\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
- "zip2 z1.d, z8.d, z12.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z2.d, z9.d, z13.d\n"
- "zip2 z3.d, z9.d, z13.d\n"
- "zip1 z4.d, z10.d, z14.d\n"
- "whilelt p3.s, %[outpos], %[outwidth]\n"
- "zip2 z5.d, z10.d, z14.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z6.d, z11.d, z15.d\n"
- "zip2 z7.d, z11.d, z15.d\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "st1w z8.s, p0, [%[outptr]]\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
- "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 5:
- __asm __volatile(
- "1:\n"
- "whilelt p0.s, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z5.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
- "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
- "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
- "incw %[inpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "zip1 z12.d, z2.d, z5.d\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z5.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z14.d, z3.d, z5.d\n"
- "zip2 z15.d, z3.d, z5.d\n"
- "zip1 z0.d, z8.d, z12.d\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
- "zip2 z1.d, z8.d, z12.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z2.d, z9.d, z13.d\n"
- "zip2 z3.d, z9.d, z13.d\n"
- "zip1 z4.d, z10.d, z14.d\n"
- "whilelt p3.s, %[outpos], %[outwidth]\n"
- "zip2 z5.d, z10.d, z14.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z6.d, z11.d, z15.d\n"
- "zip2 z7.d, z11.d, z15.d\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "st1w z8.s, p0, [%[outptr]]\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
- "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 6:
- __asm __volatile(
- "1:\n"
- "whilelt p0.s, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z6.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
- "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
- "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
- "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
- "incw %[inpos], all, mul #1\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z14.d, z3.d, z6.d\n"
- "zip2 z15.d, z3.d, z6.d\n"
- "zip1 z0.d, z8.d, z12.d\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
- "zip2 z1.d, z8.d, z12.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z2.d, z9.d, z13.d\n"
- "zip2 z3.d, z9.d, z13.d\n"
- "zip1 z4.d, z10.d, z14.d\n"
- "whilelt p3.s, %[outpos], %[outwidth]\n"
- "zip2 z5.d, z10.d, z14.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z6.d, z11.d, z15.d\n"
- "zip2 z7.d, z11.d, z15.d\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "st1w z8.s, p0, [%[outptr]]\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
- "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 7:
- __asm __volatile(
- "1:\n"
- "whilelt p0.s, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z7.s, #0\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
- "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
- "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
- "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
- "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
- "incw %[inpos], all, mul #1\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "zip1 z0.d, z8.d, z12.d\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
- "zip2 z1.d, z8.d, z12.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z2.d, z9.d, z13.d\n"
- "zip2 z3.d, z9.d, z13.d\n"
- "zip1 z4.d, z10.d, z14.d\n"
- "whilelt p3.s, %[outpos], %[outwidth]\n"
- "zip2 z5.d, z10.d, z14.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z6.d, z11.d, z15.d\n"
- "zip2 z7.d, z11.d, z15.d\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "st1w z8.s, p0, [%[outptr]]\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
- "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- default:
- case 8:
- __asm __volatile(
- "1:\n"
- "whilelt p0.s, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "ld1w z0.s, p0/z, [%[inptr0], %[inpos], LSL #2]\n"
- "ld1w z1.s, p0/z, [%[inptr1], %[inpos], LSL #2]\n"
- "ld1w z2.s, p0/z, [%[inptr2], %[inpos], LSL #2]\n"
- "ld1w z3.s, p0/z, [%[inptr3], %[inpos], LSL #2]\n"
- "ld1w z4.s, p0/z, [%[inptr4], %[inpos], LSL #2]\n"
- "ld1w z5.s, p0/z, [%[inptr5], %[inpos], LSL #2]\n"
- "ld1w z6.s, p0/z, [%[inptr6], %[inpos], LSL #2]\n"
- "ld1w z7.s, p0/z, [%[inptr7], %[inpos], LSL #2]\n"
- "incw %[inpos], all, mul #1\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p0.s, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "whilelt p1.s, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "zip1 z0.d, z8.d, z12.d\n"
- "whilelt p2.s, %[outpos], %[outwidth]\n"
- "zip2 z1.d, z8.d, z12.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z2.d, z9.d, z13.d\n"
- "zip2 z3.d, z9.d, z13.d\n"
- "zip1 z4.d, z10.d, z14.d\n"
- "whilelt p3.s, %[outpos], %[outwidth]\n"
- "zip2 z5.d, z10.d, z14.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z6.d, z11.d, z15.d\n"
- "zip2 z7.d, z11.d, z15.d\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p4.s, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "st1w z8.s, p0, [%[outptr]]\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "whilelt p5.s, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "st1w z9.s, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "incw %[outpos], all, mul #1\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "st1w z10.s, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.s, %[outpos], %[outwidth]\n"
- "st1w z11.s, p3, [%[outptr], #3, MUL VL]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z12.s, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.s, %[outpos], %[outwidth]\n"
- "incw %[outpos], all, mul #1\n"
- "st1w z13.s, p5, [%[outptr], #5, MUL VL]\n"
- "st1w z14.s, p6, [%[outptr], #6, MUL VL]\n"
- "st1w z15.s, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
-
- }
- }
-}
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_16bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_16bit.hpp
deleted file mode 100644
index 26e10511a6..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_16bit.hpp
+++ /dev/null
@@ -1,596 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-template<>
-template<typename T>
-inline void TransformImpl<8, 4, false, 2, 2, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
-{
- uint16_t *master_outptr = reinterpret_cast<uint16_t *>(out);
- const uint16_t *inptr = reinterpret_cast<const uint16_t *>(in);
-
- for (int y=y0; y<ymax; y+=8)
- {
- const int height = ymax-y;
- const long inwidth = (kmax - k0);
- const long outwidth = ((inwidth + 3) / 4) * 32;
- long inpos = 0;
- long outpos = 0;
-
- uint16_t *outptr = master_outptr;
- master_outptr += outwidth;
-
- const uint16_t *inptr0 = inptr + y * ldin + k0;
- const uint16_t *inptr1 = inptr0 + ldin;
- const uint16_t *inptr2 = inptr1 + ldin;
- const uint16_t *inptr3 = inptr2 + ldin;
- const uint16_t *inptr4 = inptr3 + ldin;
- const uint16_t *inptr5 = inptr4 + ldin;
- const uint16_t *inptr6 = inptr5 + ldin;
- const uint16_t *inptr7 = inptr6 + ldin;
-
- switch(height)
- {
- case 1:
- __asm __volatile(
- "1:\n"
- "whilelt p0.h, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z4.h, #0\n"
- "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
- "inch %[inpos], all, mul #1\n"
- "whilelt p0.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "whilelt p1.h, %[outpos], %[outwidth]\n"
- "zip1 z0.d, z8.d, z4.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z1.d, z8.d, z4.d\n"
- "zip1 z2.d, z9.d, z4.d\n"
- "zip2 z3.d, z9.d, z4.d\n"
- "whilelt p2.h, %[outpos], %[outwidth]\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "zip1 z10.d, z1.d, z4.d\n"
- "st1h z8.h, p0, [%[outptr]]\n"
- "zip2 z11.d, z1.d, z4.d\n"
- "whilelt p3.h, %[outpos], %[outwidth]\n"
- "zip1 z12.d, z2.d, z4.d\n"
- "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
- "zip2 z13.d, z2.d, z4.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z14.d, z3.d, z4.d\n"
- "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
- "zip2 z15.d, z3.d, z4.d\n"
- "whilelt p4.h, %[outpos], %[outwidth]\n"
- "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p5.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
- "whilelt p6.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
- "whilelt p7.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 2:
- __asm __volatile(
- "1:\n"
- "whilelt p0.h, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z4.h, #0\n"
- "mov z14.h, #0\n"
- "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
- "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
- "inch %[inpos], all, mul #1\n"
- "whilelt p0.h, %[outpos], %[outwidth]\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "zip1 z10.d, z1.d, z4.d\n"
- "zip2 z11.d, z1.d, z4.d\n"
- "whilelt p1.h, %[outpos], %[outwidth]\n"
- "zip1 z0.d, z8.d, z4.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z1.d, z8.d, z4.d\n"
- "zip1 z2.d, z9.d, z4.d\n"
- "zip2 z3.d, z9.d, z4.d\n"
- "whilelt p2.h, %[outpos], %[outwidth]\n"
- "zip1 z4.d, z10.d, z14.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z5.d, z10.d, z14.d\n"
- "zip1 z6.d, z11.d, z14.d\n"
- "zip2 z7.d, z11.d, z14.d\n"
- "whilelt p3.h, %[outpos], %[outwidth]\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "st1h z8.h, p0, [%[outptr]]\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "whilelt p4.h, %[outpos], %[outwidth]\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "whilelt p5.h, %[outpos], %[outwidth]\n"
- "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p6.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
- "whilelt p7.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
- "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 3:
- __asm __volatile(
- "1:\n"
- "whilelt p0.h, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z4.h, #0\n"
- "mov z14.h, #0\n"
- "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
- "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
- "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
- "inch %[inpos], all, mul #1\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p0.h, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z4.d\n"
- "zip2 z11.d, z1.d, z4.d\n"
- "zip1 z12.d, z2.d, z4.d\n"
- "whilelt p1.h, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z4.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z0.d, z8.d, z12.d\n"
- "zip2 z1.d, z8.d, z12.d\n"
- "zip1 z2.d, z9.d, z13.d\n"
- "whilelt p2.h, %[outpos], %[outwidth]\n"
- "zip2 z3.d, z9.d, z13.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z4.d, z10.d, z14.d\n"
- "zip2 z5.d, z10.d, z14.d\n"
- "zip1 z6.d, z11.d, z14.d\n"
- "whilelt p3.h, %[outpos], %[outwidth]\n"
- "zip2 z7.d, z11.d, z14.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "whilelt p4.h, %[outpos], %[outwidth]\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "st1h z8.h, p0, [%[outptr]]\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "whilelt p5.h, %[outpos], %[outwidth]\n"
- "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
- "whilelt p6.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
- "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
- "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 4:
- __asm __volatile(
- "1:\n"
- "whilelt p0.h, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z4.h, #0\n"
- "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
- "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
- "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
- "ld1h z3.h, p0/z, [%[inptr3], %[inpos], LSL #1]\n"
- "inch %[inpos], all, mul #1\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p0.h, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z4.d\n"
- "zip2 z11.d, z1.d, z4.d\n"
- "zip1 z12.d, z2.d, z4.d\n"
- "whilelt p1.h, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z4.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z14.d, z3.d, z4.d\n"
- "zip2 z15.d, z3.d, z4.d\n"
- "zip1 z0.d, z8.d, z12.d\n"
- "whilelt p2.h, %[outpos], %[outwidth]\n"
- "zip2 z1.d, z8.d, z12.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z2.d, z9.d, z13.d\n"
- "zip2 z3.d, z9.d, z13.d\n"
- "zip1 z4.d, z10.d, z14.d\n"
- "whilelt p3.h, %[outpos], %[outwidth]\n"
- "zip2 z5.d, z10.d, z14.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z6.d, z11.d, z15.d\n"
- "zip2 z7.d, z11.d, z15.d\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p4.h, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "st1h z8.h, p0, [%[outptr]]\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "whilelt p5.h, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.h, %[outpos], %[outwidth]\n"
- "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
- "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
- "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 5:
- __asm __volatile(
- "1:\n"
- "whilelt p0.h, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z5.h, #0\n"
- "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
- "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
- "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
- "ld1h z3.h, p0/z, [%[inptr3], %[inpos], LSL #1]\n"
- "ld1h z4.h, p0/z, [%[inptr4], %[inpos], LSL #1]\n"
- "inch %[inpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "whilelt p0.h, %[outpos], %[outwidth]\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "zip1 z12.d, z2.d, z5.d\n"
- "whilelt p1.h, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z5.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z14.d, z3.d, z5.d\n"
- "zip2 z15.d, z3.d, z5.d\n"
- "zip1 z0.d, z8.d, z12.d\n"
- "whilelt p2.h, %[outpos], %[outwidth]\n"
- "zip2 z1.d, z8.d, z12.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z2.d, z9.d, z13.d\n"
- "zip2 z3.d, z9.d, z13.d\n"
- "zip1 z4.d, z10.d, z14.d\n"
- "whilelt p3.h, %[outpos], %[outwidth]\n"
- "zip2 z5.d, z10.d, z14.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z6.d, z11.d, z15.d\n"
- "zip2 z7.d, z11.d, z15.d\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p4.h, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "st1h z8.h, p0, [%[outptr]]\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "whilelt p5.h, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.h, %[outpos], %[outwidth]\n"
- "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
- "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
- "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 6:
- __asm __volatile(
- "1:\n"
- "whilelt p0.h, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z6.h, #0\n"
- "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
- "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
- "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
- "ld1h z3.h, p0/z, [%[inptr3], %[inpos], LSL #1]\n"
- "ld1h z4.h, p0/z, [%[inptr4], %[inpos], LSL #1]\n"
- "ld1h z5.h, p0/z, [%[inptr5], %[inpos], LSL #1]\n"
- "inch %[inpos], all, mul #1\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "whilelt p0.h, %[outpos], %[outwidth]\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "whilelt p1.h, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z14.d, z3.d, z6.d\n"
- "zip2 z15.d, z3.d, z6.d\n"
- "zip1 z0.d, z8.d, z12.d\n"
- "whilelt p2.h, %[outpos], %[outwidth]\n"
- "zip2 z1.d, z8.d, z12.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z2.d, z9.d, z13.d\n"
- "zip2 z3.d, z9.d, z13.d\n"
- "zip1 z4.d, z10.d, z14.d\n"
- "whilelt p3.h, %[outpos], %[outwidth]\n"
- "zip2 z5.d, z10.d, z14.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z6.d, z11.d, z15.d\n"
- "zip2 z7.d, z11.d, z15.d\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p4.h, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "st1h z8.h, p0, [%[outptr]]\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "whilelt p5.h, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.h, %[outpos], %[outwidth]\n"
- "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
- "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
- "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 7:
- __asm __volatile(
- "1:\n"
- "whilelt p0.h, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z7.h, #0\n"
- "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
- "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
- "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
- "ld1h z3.h, p0/z, [%[inptr3], %[inpos], LSL #1]\n"
- "ld1h z4.h, p0/z, [%[inptr4], %[inpos], LSL #1]\n"
- "ld1h z5.h, p0/z, [%[inptr5], %[inpos], LSL #1]\n"
- "ld1h z6.h, p0/z, [%[inptr6], %[inpos], LSL #1]\n"
- "inch %[inpos], all, mul #1\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p0.h, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "whilelt p1.h, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "zip1 z0.d, z8.d, z12.d\n"
- "whilelt p2.h, %[outpos], %[outwidth]\n"
- "zip2 z1.d, z8.d, z12.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z2.d, z9.d, z13.d\n"
- "zip2 z3.d, z9.d, z13.d\n"
- "zip1 z4.d, z10.d, z14.d\n"
- "whilelt p3.h, %[outpos], %[outwidth]\n"
- "zip2 z5.d, z10.d, z14.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z6.d, z11.d, z15.d\n"
- "zip2 z7.d, z11.d, z15.d\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p4.h, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "st1h z8.h, p0, [%[outptr]]\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "whilelt p5.h, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.h, %[outpos], %[outwidth]\n"
- "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
- "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
- "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- default:
- case 8:
- __asm __volatile(
- "1:\n"
- "whilelt p0.h, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "ld1h z0.h, p0/z, [%[inptr0], %[inpos], LSL #1]\n"
- "ld1h z1.h, p0/z, [%[inptr1], %[inpos], LSL #1]\n"
- "ld1h z2.h, p0/z, [%[inptr2], %[inpos], LSL #1]\n"
- "ld1h z3.h, p0/z, [%[inptr3], %[inpos], LSL #1]\n"
- "ld1h z4.h, p0/z, [%[inptr4], %[inpos], LSL #1]\n"
- "ld1h z5.h, p0/z, [%[inptr5], %[inpos], LSL #1]\n"
- "ld1h z6.h, p0/z, [%[inptr6], %[inpos], LSL #1]\n"
- "ld1h z7.h, p0/z, [%[inptr7], %[inpos], LSL #1]\n"
- "inch %[inpos], all, mul #1\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p0.h, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "whilelt p1.h, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "zip1 z0.d, z8.d, z12.d\n"
- "whilelt p2.h, %[outpos], %[outwidth]\n"
- "zip2 z1.d, z8.d, z12.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z2.d, z9.d, z13.d\n"
- "zip2 z3.d, z9.d, z13.d\n"
- "zip1 z4.d, z10.d, z14.d\n"
- "whilelt p3.h, %[outpos], %[outwidth]\n"
- "zip2 z5.d, z10.d, z14.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z6.d, z11.d, z15.d\n"
- "zip2 z7.d, z11.d, z15.d\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p4.h, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "st1h z8.h, p0, [%[outptr]]\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "whilelt p5.h, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "st1h z9.h, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "inch %[outpos], all, mul #1\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "st1h z10.h, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.h, %[outpos], %[outwidth]\n"
- "st1h z11.h, p3, [%[outptr], #3, MUL VL]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z12.h, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.h, %[outpos], %[outwidth]\n"
- "inch %[outpos], all, mul #1\n"
- "st1h z13.h, p5, [%[outptr], #5, MUL VL]\n"
- "st1h z14.h, p6, [%[outptr], #6, MUL VL]\n"
- "st1h z15.h, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
-
- }
- }
-}
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp
deleted file mode 100644
index ed0d58aa91..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block4_8bit.hpp
+++ /dev/null
@@ -1,596 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-template<>
-template<typename T>
-inline void TransformImpl<8, 4, false, 1, 1, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
-{
- uint8_t *master_outptr = reinterpret_cast<uint8_t *>(out);
- const uint8_t *inptr = reinterpret_cast<const uint8_t *>(in);
-
- for (int y=y0; y<ymax; y+=8)
- {
- const int height = ymax-y;
- const long inwidth = (kmax - k0);
- const long outwidth = ((inwidth + 3) / 4) * 32;
- long inpos = 0;
- long outpos = 0;
-
- uint8_t *outptr = master_outptr;
- master_outptr += outwidth;
-
- const uint8_t *inptr0 = inptr + y * ldin + k0;
- const uint8_t *inptr1 = inptr0 + ldin;
- const uint8_t *inptr2 = inptr1 + ldin;
- const uint8_t *inptr3 = inptr2 + ldin;
- const uint8_t *inptr4 = inptr3 + ldin;
- const uint8_t *inptr5 = inptr4 + ldin;
- const uint8_t *inptr6 = inptr5 + ldin;
- const uint8_t *inptr7 = inptr6 + ldin;
-
- switch(height)
- {
- case 1:
- __asm __volatile(
- "1:\n"
- "whilelt p0.b, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z4.b, #0\n"
- "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
- "incb %[inpos], all, mul #1\n"
- "whilelt p0.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "whilelt p1.b, %[outpos], %[outwidth]\n"
- "zip1 z0.s, z8.s, z4.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z1.s, z8.s, z4.s\n"
- "zip1 z2.s, z9.s, z4.s\n"
- "zip2 z3.s, z9.s, z4.s\n"
- "whilelt p2.b, %[outpos], %[outwidth]\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "zip1 z10.s, z1.s, z4.s\n"
- "st1b z8.b, p0, [%[outptr]]\n"
- "zip2 z11.s, z1.s, z4.s\n"
- "whilelt p3.b, %[outpos], %[outwidth]\n"
- "zip1 z12.s, z2.s, z4.s\n"
- "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
- "zip2 z13.s, z2.s, z4.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z4.s\n"
- "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
- "zip2 z15.s, z3.s, z4.s\n"
- "whilelt p4.b, %[outpos], %[outwidth]\n"
- "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p5.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
- "whilelt p6.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
- "whilelt p7.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 2:
- __asm __volatile(
- "1:\n"
- "whilelt p0.b, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z4.b, #0\n"
- "mov z14.b, #0\n"
- "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
- "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
- "incb %[inpos], all, mul #1\n"
- "whilelt p0.b, %[outpos], %[outwidth]\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "zip1 z10.s, z1.s, z4.s\n"
- "zip2 z11.s, z1.s, z4.s\n"
- "whilelt p1.b, %[outpos], %[outwidth]\n"
- "zip1 z0.s, z8.s, z4.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z1.s, z8.s, z4.s\n"
- "zip1 z2.s, z9.s, z4.s\n"
- "zip2 z3.s, z9.s, z4.s\n"
- "whilelt p2.b, %[outpos], %[outwidth]\n"
- "zip1 z4.s, z10.s, z14.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z5.s, z10.s, z14.s\n"
- "zip1 z6.s, z11.s, z14.s\n"
- "zip2 z7.s, z11.s, z14.s\n"
- "whilelt p3.b, %[outpos], %[outwidth]\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "st1b z8.b, p0, [%[outptr]]\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "whilelt p4.b, %[outpos], %[outwidth]\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "whilelt p5.b, %[outpos], %[outwidth]\n"
- "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p6.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
- "whilelt p7.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
- "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 3:
- __asm __volatile(
- "1:\n"
- "whilelt p0.b, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z4.b, #0\n"
- "mov z14.b, #0\n"
- "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
- "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
- "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
- "incb %[inpos], all, mul #1\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p0.b, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z4.s\n"
- "zip2 z11.s, z1.s, z4.s\n"
- "zip1 z12.s, z2.s, z4.s\n"
- "whilelt p1.b, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z4.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z0.s, z8.s, z12.s\n"
- "zip2 z1.s, z8.s, z12.s\n"
- "zip1 z2.s, z9.s, z13.s\n"
- "whilelt p2.b, %[outpos], %[outwidth]\n"
- "zip2 z3.s, z9.s, z13.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z4.s, z10.s, z14.s\n"
- "zip2 z5.s, z10.s, z14.s\n"
- "zip1 z6.s, z11.s, z14.s\n"
- "whilelt p3.b, %[outpos], %[outwidth]\n"
- "zip2 z7.s, z11.s, z14.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "whilelt p4.b, %[outpos], %[outwidth]\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "st1b z8.b, p0, [%[outptr]]\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "whilelt p5.b, %[outpos], %[outwidth]\n"
- "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
- "whilelt p6.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
- "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
- "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 4:
- __asm __volatile(
- "1:\n"
- "whilelt p0.b, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z4.b, #0\n"
- "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
- "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
- "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
- "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
- "incb %[inpos], all, mul #1\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p0.b, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z4.s\n"
- "zip2 z11.s, z1.s, z4.s\n"
- "zip1 z12.s, z2.s, z4.s\n"
- "whilelt p1.b, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z4.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z4.s\n"
- "zip2 z15.s, z3.s, z4.s\n"
- "zip1 z0.s, z8.s, z12.s\n"
- "whilelt p2.b, %[outpos], %[outwidth]\n"
- "zip2 z1.s, z8.s, z12.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z2.s, z9.s, z13.s\n"
- "zip2 z3.s, z9.s, z13.s\n"
- "zip1 z4.s, z10.s, z14.s\n"
- "whilelt p3.b, %[outpos], %[outwidth]\n"
- "zip2 z5.s, z10.s, z14.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z6.s, z11.s, z15.s\n"
- "zip2 z7.s, z11.s, z15.s\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p4.b, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "st1b z8.b, p0, [%[outptr]]\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p5.b, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.b, %[outpos], %[outwidth]\n"
- "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
- "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
- "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 5:
- __asm __volatile(
- "1:\n"
- "whilelt p0.b, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z5.b, #0\n"
- "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
- "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
- "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
- "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
- "ld1b z4.b, p0/z, [%[inptr4], %[inpos]]\n"
- "incb %[inpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "whilelt p0.b, %[outpos], %[outwidth]\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z5.s\n"
- "whilelt p1.b, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z5.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z5.s\n"
- "zip2 z15.s, z3.s, z5.s\n"
- "zip1 z0.s, z8.s, z12.s\n"
- "whilelt p2.b, %[outpos], %[outwidth]\n"
- "zip2 z1.s, z8.s, z12.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z2.s, z9.s, z13.s\n"
- "zip2 z3.s, z9.s, z13.s\n"
- "zip1 z4.s, z10.s, z14.s\n"
- "whilelt p3.b, %[outpos], %[outwidth]\n"
- "zip2 z5.s, z10.s, z14.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z6.s, z11.s, z15.s\n"
- "zip2 z7.s, z11.s, z15.s\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p4.b, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "st1b z8.b, p0, [%[outptr]]\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p5.b, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.b, %[outpos], %[outwidth]\n"
- "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
- "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
- "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 6:
- __asm __volatile(
- "1:\n"
- "whilelt p0.b, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z6.b, #0\n"
- "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
- "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
- "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
- "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
- "ld1b z4.b, p0/z, [%[inptr4], %[inpos]]\n"
- "ld1b z5.b, p0/z, [%[inptr5], %[inpos]]\n"
- "incb %[inpos], all, mul #1\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p0.b, %[outpos], %[outwidth]\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "whilelt p1.b, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z6.s\n"
- "zip2 z15.s, z3.s, z6.s\n"
- "zip1 z0.s, z8.s, z12.s\n"
- "whilelt p2.b, %[outpos], %[outwidth]\n"
- "zip2 z1.s, z8.s, z12.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z2.s, z9.s, z13.s\n"
- "zip2 z3.s, z9.s, z13.s\n"
- "zip1 z4.s, z10.s, z14.s\n"
- "whilelt p3.b, %[outpos], %[outwidth]\n"
- "zip2 z5.s, z10.s, z14.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z6.s, z11.s, z15.s\n"
- "zip2 z7.s, z11.s, z15.s\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p4.b, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "st1b z8.b, p0, [%[outptr]]\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p5.b, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.b, %[outpos], %[outwidth]\n"
- "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
- "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
- "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 7:
- __asm __volatile(
- "1:\n"
- "whilelt p0.b, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z7.b, #0\n"
- "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
- "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
- "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
- "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
- "ld1b z4.b, p0/z, [%[inptr4], %[inpos]]\n"
- "ld1b z5.b, p0/z, [%[inptr5], %[inpos]]\n"
- "ld1b z6.b, p0/z, [%[inptr6], %[inpos]]\n"
- "incb %[inpos], all, mul #1\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p0.b, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p1.b, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "zip1 z0.s, z8.s, z12.s\n"
- "whilelt p2.b, %[outpos], %[outwidth]\n"
- "zip2 z1.s, z8.s, z12.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z2.s, z9.s, z13.s\n"
- "zip2 z3.s, z9.s, z13.s\n"
- "zip1 z4.s, z10.s, z14.s\n"
- "whilelt p3.b, %[outpos], %[outwidth]\n"
- "zip2 z5.s, z10.s, z14.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z6.s, z11.s, z15.s\n"
- "zip2 z7.s, z11.s, z15.s\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p4.b, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "st1b z8.b, p0, [%[outptr]]\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p5.b, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.b, %[outpos], %[outwidth]\n"
- "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
- "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
- "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- default:
- case 8:
- __asm __volatile(
- "1:\n"
- "whilelt p0.b, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
- "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
- "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
- "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
- "ld1b z4.b, p0/z, [%[inptr4], %[inpos]]\n"
- "ld1b z5.b, p0/z, [%[inptr5], %[inpos]]\n"
- "ld1b z6.b, p0/z, [%[inptr6], %[inpos]]\n"
- "ld1b z7.b, p0/z, [%[inptr7], %[inpos]]\n"
- "incb %[inpos], all, mul #1\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p0.b, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p1.b, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "zip1 z0.s, z8.s, z12.s\n"
- "whilelt p2.b, %[outpos], %[outwidth]\n"
- "zip2 z1.s, z8.s, z12.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z2.s, z9.s, z13.s\n"
- "zip2 z3.s, z9.s, z13.s\n"
- "zip1 z4.s, z10.s, z14.s\n"
- "whilelt p3.b, %[outpos], %[outwidth]\n"
- "zip2 z5.s, z10.s, z14.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z6.s, z11.s, z15.s\n"
- "zip2 z7.s, z11.s, z15.s\n"
- "zip1 z8.s, z0.s, z4.s\n"
- "whilelt p4.b, %[outpos], %[outwidth]\n"
- "zip2 z9.s, z0.s, z4.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z10.s, z1.s, z5.s\n"
- "st1b z8.b, p0, [%[outptr]]\n"
- "zip2 z11.s, z1.s, z5.s\n"
- "zip1 z12.s, z2.s, z6.s\n"
- "whilelt p5.b, %[outpos], %[outwidth]\n"
- "zip2 z13.s, z2.s, z6.s\n"
- "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.s, z3.s, z7.s\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z15.s, z3.s, z7.s\n"
- "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.b, %[outpos], %[outwidth]\n"
- "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
- "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
- "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
-
- }
- }
-}
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block8_8bit.hpp b/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block8_8bit.hpp
deleted file mode 100644
index b4935e6417..0000000000
--- a/src/core/NEON/kernels/arm_gemm/transforms/sve_interleave_8way_block8_8bit.hpp
+++ /dev/null
@@ -1,596 +0,0 @@
-/*
- * Copyright (c) 2019 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#pragma once
-
-#ifdef __ARM_FEATURE_SVE
-
-template<>
-template<typename T>
-inline void TransformImpl<8, 8, false, 1, 1, false>::Transform(T *out, const T *in, int ldin, int y0, int ymax, int k0, int kmax)
-{
- uint8_t *master_outptr = reinterpret_cast<uint8_t *>(out);
- const uint8_t *inptr = reinterpret_cast<const uint8_t *>(in);
-
- for (int y=y0; y<ymax; y+=8)
- {
- const int height = ymax-y;
- const long inwidth = (kmax - k0);
- const long outwidth = ((inwidth + 7) / 8) * 64;
- long inpos = 0;
- long outpos = 0;
-
- uint8_t *outptr = master_outptr;
- master_outptr += outwidth;
-
- const uint8_t *inptr0 = inptr + y * ldin + k0;
- const uint8_t *inptr1 = inptr0 + ldin;
- const uint8_t *inptr2 = inptr1 + ldin;
- const uint8_t *inptr3 = inptr2 + ldin;
- const uint8_t *inptr4 = inptr3 + ldin;
- const uint8_t *inptr5 = inptr4 + ldin;
- const uint8_t *inptr6 = inptr5 + ldin;
- const uint8_t *inptr7 = inptr6 + ldin;
-
- switch(height)
- {
- case 1:
- __asm __volatile(
- "1:\n"
- "whilelt p0.b, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z4.b, #0\n"
- "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
- "incb %[inpos], all, mul #1\n"
- "whilelt p0.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "whilelt p1.b, %[outpos], %[outwidth]\n"
- "zip1 z0.d, z8.d, z4.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z1.d, z8.d, z4.d\n"
- "zip1 z2.d, z9.d, z4.d\n"
- "zip2 z3.d, z9.d, z4.d\n"
- "whilelt p2.b, %[outpos], %[outwidth]\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "zip1 z10.d, z1.d, z4.d\n"
- "st1b z8.b, p0, [%[outptr]]\n"
- "zip2 z11.d, z1.d, z4.d\n"
- "whilelt p3.b, %[outpos], %[outwidth]\n"
- "zip1 z12.d, z2.d, z4.d\n"
- "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
- "zip2 z13.d, z2.d, z4.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z14.d, z3.d, z4.d\n"
- "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
- "zip2 z15.d, z3.d, z4.d\n"
- "whilelt p4.b, %[outpos], %[outwidth]\n"
- "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p5.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
- "whilelt p6.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
- "whilelt p7.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 2:
- __asm __volatile(
- "1:\n"
- "whilelt p0.b, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z4.b, #0\n"
- "mov z14.b, #0\n"
- "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
- "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
- "incb %[inpos], all, mul #1\n"
- "whilelt p0.b, %[outpos], %[outwidth]\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "zip1 z10.d, z1.d, z4.d\n"
- "zip2 z11.d, z1.d, z4.d\n"
- "whilelt p1.b, %[outpos], %[outwidth]\n"
- "zip1 z0.d, z8.d, z4.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z1.d, z8.d, z4.d\n"
- "zip1 z2.d, z9.d, z4.d\n"
- "zip2 z3.d, z9.d, z4.d\n"
- "whilelt p2.b, %[outpos], %[outwidth]\n"
- "zip1 z4.d, z10.d, z14.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z5.d, z10.d, z14.d\n"
- "zip1 z6.d, z11.d, z14.d\n"
- "zip2 z7.d, z11.d, z14.d\n"
- "whilelt p3.b, %[outpos], %[outwidth]\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "st1b z8.b, p0, [%[outptr]]\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "whilelt p4.b, %[outpos], %[outwidth]\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "whilelt p5.b, %[outpos], %[outwidth]\n"
- "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p6.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
- "whilelt p7.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
- "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 3:
- __asm __volatile(
- "1:\n"
- "whilelt p0.b, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z4.b, #0\n"
- "mov z14.b, #0\n"
- "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
- "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
- "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
- "incb %[inpos], all, mul #1\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p0.b, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z4.d\n"
- "zip2 z11.d, z1.d, z4.d\n"
- "zip1 z12.d, z2.d, z4.d\n"
- "whilelt p1.b, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z4.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z0.d, z8.d, z12.d\n"
- "zip2 z1.d, z8.d, z12.d\n"
- "zip1 z2.d, z9.d, z13.d\n"
- "whilelt p2.b, %[outpos], %[outwidth]\n"
- "zip2 z3.d, z9.d, z13.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z4.d, z10.d, z14.d\n"
- "zip2 z5.d, z10.d, z14.d\n"
- "zip1 z6.d, z11.d, z14.d\n"
- "whilelt p3.b, %[outpos], %[outwidth]\n"
- "zip2 z7.d, z11.d, z14.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "whilelt p4.b, %[outpos], %[outwidth]\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "st1b z8.b, p0, [%[outptr]]\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "whilelt p5.b, %[outpos], %[outwidth]\n"
- "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
- "whilelt p6.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
- "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
- "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 4:
- __asm __volatile(
- "1:\n"
- "whilelt p0.b, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z4.b, #0\n"
- "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
- "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
- "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
- "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
- "incb %[inpos], all, mul #1\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p0.b, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z4.d\n"
- "zip2 z11.d, z1.d, z4.d\n"
- "zip1 z12.d, z2.d, z4.d\n"
- "whilelt p1.b, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z4.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z14.d, z3.d, z4.d\n"
- "zip2 z15.d, z3.d, z4.d\n"
- "zip1 z0.d, z8.d, z12.d\n"
- "whilelt p2.b, %[outpos], %[outwidth]\n"
- "zip2 z1.d, z8.d, z12.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z2.d, z9.d, z13.d\n"
- "zip2 z3.d, z9.d, z13.d\n"
- "zip1 z4.d, z10.d, z14.d\n"
- "whilelt p3.b, %[outpos], %[outwidth]\n"
- "zip2 z5.d, z10.d, z14.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z6.d, z11.d, z15.d\n"
- "zip2 z7.d, z11.d, z15.d\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p4.b, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "st1b z8.b, p0, [%[outptr]]\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "whilelt p5.b, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.b, %[outpos], %[outwidth]\n"
- "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
- "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
- "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 5:
- __asm __volatile(
- "1:\n"
- "whilelt p0.b, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z5.b, #0\n"
- "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
- "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
- "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
- "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
- "ld1b z4.b, p0/z, [%[inptr4], %[inpos]]\n"
- "incb %[inpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "whilelt p0.b, %[outpos], %[outwidth]\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "zip1 z12.d, z2.d, z5.d\n"
- "whilelt p1.b, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z5.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z14.d, z3.d, z5.d\n"
- "zip2 z15.d, z3.d, z5.d\n"
- "zip1 z0.d, z8.d, z12.d\n"
- "whilelt p2.b, %[outpos], %[outwidth]\n"
- "zip2 z1.d, z8.d, z12.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z2.d, z9.d, z13.d\n"
- "zip2 z3.d, z9.d, z13.d\n"
- "zip1 z4.d, z10.d, z14.d\n"
- "whilelt p3.b, %[outpos], %[outwidth]\n"
- "zip2 z5.d, z10.d, z14.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z6.d, z11.d, z15.d\n"
- "zip2 z7.d, z11.d, z15.d\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p4.b, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "st1b z8.b, p0, [%[outptr]]\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "whilelt p5.b, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.b, %[outpos], %[outwidth]\n"
- "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
- "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
- "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 6:
- __asm __volatile(
- "1:\n"
- "whilelt p0.b, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z6.b, #0\n"
- "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
- "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
- "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
- "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
- "ld1b z4.b, p0/z, [%[inptr4], %[inpos]]\n"
- "ld1b z5.b, p0/z, [%[inptr5], %[inpos]]\n"
- "incb %[inpos], all, mul #1\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "whilelt p0.b, %[outpos], %[outwidth]\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "whilelt p1.b, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z14.d, z3.d, z6.d\n"
- "zip2 z15.d, z3.d, z6.d\n"
- "zip1 z0.d, z8.d, z12.d\n"
- "whilelt p2.b, %[outpos], %[outwidth]\n"
- "zip2 z1.d, z8.d, z12.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z2.d, z9.d, z13.d\n"
- "zip2 z3.d, z9.d, z13.d\n"
- "zip1 z4.d, z10.d, z14.d\n"
- "whilelt p3.b, %[outpos], %[outwidth]\n"
- "zip2 z5.d, z10.d, z14.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z6.d, z11.d, z15.d\n"
- "zip2 z7.d, z11.d, z15.d\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p4.b, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "st1b z8.b, p0, [%[outptr]]\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "whilelt p5.b, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.b, %[outpos], %[outwidth]\n"
- "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
- "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
- "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- case 7:
- __asm __volatile(
- "1:\n"
- "whilelt p0.b, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "mov z7.b, #0\n"
- "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
- "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
- "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
- "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
- "ld1b z4.b, p0/z, [%[inptr4], %[inpos]]\n"
- "ld1b z5.b, p0/z, [%[inptr5], %[inpos]]\n"
- "ld1b z6.b, p0/z, [%[inptr6], %[inpos]]\n"
- "incb %[inpos], all, mul #1\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p0.b, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "whilelt p1.b, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "zip1 z0.d, z8.d, z12.d\n"
- "whilelt p2.b, %[outpos], %[outwidth]\n"
- "zip2 z1.d, z8.d, z12.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z2.d, z9.d, z13.d\n"
- "zip2 z3.d, z9.d, z13.d\n"
- "zip1 z4.d, z10.d, z14.d\n"
- "whilelt p3.b, %[outpos], %[outwidth]\n"
- "zip2 z5.d, z10.d, z14.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z6.d, z11.d, z15.d\n"
- "zip2 z7.d, z11.d, z15.d\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p4.b, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "st1b z8.b, p0, [%[outptr]]\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "whilelt p5.b, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.b, %[outpos], %[outwidth]\n"
- "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
- "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
- "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
- default:
- case 8:
- __asm __volatile(
- "1:\n"
- "whilelt p0.b, %[inpos], %[inwidth]\n"
- "b.none 2f\n"
- "ld1b z0.b, p0/z, [%[inptr0], %[inpos]]\n"
- "ld1b z1.b, p0/z, [%[inptr1], %[inpos]]\n"
- "ld1b z2.b, p0/z, [%[inptr2], %[inpos]]\n"
- "ld1b z3.b, p0/z, [%[inptr3], %[inpos]]\n"
- "ld1b z4.b, p0/z, [%[inptr4], %[inpos]]\n"
- "ld1b z5.b, p0/z, [%[inptr5], %[inpos]]\n"
- "ld1b z6.b, p0/z, [%[inptr6], %[inpos]]\n"
- "ld1b z7.b, p0/z, [%[inptr7], %[inpos]]\n"
- "incb %[inpos], all, mul #1\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p0.b, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "whilelt p1.b, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "zip1 z0.d, z8.d, z12.d\n"
- "whilelt p2.b, %[outpos], %[outwidth]\n"
- "zip2 z1.d, z8.d, z12.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z2.d, z9.d, z13.d\n"
- "zip2 z3.d, z9.d, z13.d\n"
- "zip1 z4.d, z10.d, z14.d\n"
- "whilelt p3.b, %[outpos], %[outwidth]\n"
- "zip2 z5.d, z10.d, z14.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z6.d, z11.d, z15.d\n"
- "zip2 z7.d, z11.d, z15.d\n"
- "zip1 z8.d, z0.d, z4.d\n"
- "whilelt p4.b, %[outpos], %[outwidth]\n"
- "zip2 z9.d, z0.d, z4.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip1 z10.d, z1.d, z5.d\n"
- "st1b z8.b, p0, [%[outptr]]\n"
- "zip2 z11.d, z1.d, z5.d\n"
- "zip1 z12.d, z2.d, z6.d\n"
- "whilelt p5.b, %[outpos], %[outwidth]\n"
- "zip2 z13.d, z2.d, z6.d\n"
- "st1b z9.b, p1, [%[outptr], #1, MUL VL]\n"
- "zip1 z14.d, z3.d, z7.d\n"
- "incb %[outpos], all, mul #1\n"
- "zip2 z15.d, z3.d, z7.d\n"
- "st1b z10.b, p2, [%[outptr], #2, MUL VL]\n"
- "whilelt p6.b, %[outpos], %[outwidth]\n"
- "st1b z11.b, p3, [%[outptr], #3, MUL VL]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z12.b, p4, [%[outptr], #4, MUL VL]\n"
- "whilelt p7.b, %[outpos], %[outwidth]\n"
- "incb %[outpos], all, mul #1\n"
- "st1b z13.b, p5, [%[outptr], #5, MUL VL]\n"
- "st1b z14.b, p6, [%[outptr], #6, MUL VL]\n"
- "st1b z15.b, p7, [%[outptr], #7, MUL VL]\n"
- "addvl %[outptr], %[outptr], #8\n"
- "b 1b\n"
- "2:\n"
- : [inpos] "+r" (inpos), [outpos] "+r" (outpos), [outptr] "+r" (outptr), [inptr0] "+r" (inptr0), [inptr1] "+r" (inptr1), [inptr2] "+r" (inptr2), [inptr3] "+r" (inptr3), [inptr4] "+r" (inptr4), [inptr5] "+r" (inptr5), [inptr6] "+r" (inptr6), [inptr7] "+r" (inptr7)
- : [outwidth] "r" (outwidth), [inwidth] "r" (inwidth)
- : "p0", "p1", "p2", "p3", "p4", "p5", "p6", "p7", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z13", "z14", "z15", "cc", "memory"
- );
- break;
-
-
- }
- }
-}
-
-#endif // __ARM_FEATURE_SVE
diff --git a/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
index aac5e19ebe..a3216c494f 100644
--- a/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
+++ b/src/core/NEON/kernels/arm_gemm/transforms/transpose_interleave_common.hpp
@@ -23,6 +23,8 @@
*/
#pragma once
+#include "../asmlib.hpp"
+
template <unsigned int IntBy, typename TIn, typename TOut>
struct TransposeInterleaveCommon {
// Override the moveblock_1xY methods to improve performance
diff --git a/src/core/NEON/kernels/arm_gemm/utils.hpp b/src/core/NEON/kernels/arm_gemm/utils.hpp
index 6e47a97c78..6d483a3b9d 100644
--- a/src/core/NEON/kernels/arm_gemm/utils.hpp
+++ b/src/core/NEON/kernels/arm_gemm/utils.hpp
@@ -24,6 +24,8 @@
#pragma once
+#include "arm_gemm.hpp"
+
#include <cstddef>
// Macro for unreachable code (e.g. impossible default cases on switch)
@@ -32,6 +34,8 @@
// Paranoid option for the above with assert
// #define UNREACHABLE(why) assert(0 && why)
+namespace arm_gemm {
+
template<typename T>
inline T iceildiv(const T a, const T b) {
return (a + b - 1) / b;
@@ -48,7 +52,94 @@ inline T roundup(const T a, const T b) {
}
}
-namespace arm_gemm {
+enum class VLType { // NOTE(review): presumably tags which vector-length flavour a kernel is built for; not shown here — confirm against users of VLType
+ None,
+ SVE,
+};
+
+/*
+ * Output destination descriptor with two mutually exclusive forms, selected
+ * by 'is_indirect':
+ *  - direct:   a base pointer and a stride;
+ *  - indirect: a table of pointers and an offset.
+ * Whichever form is not supplied at construction keeps its zero-initialised
+ * default.
+ */
+template<typename T>
+struct IndirectOutputArg {
+    struct {
+        T *base;
+        size_t stride;
+    } direct = {};
+    struct {
+        T * const *ptr;
+        size_t offset;
+    } indirect = {};
+    bool is_indirect;
+
+    // Default state: direct form with a null base pointer and a stride of zero.
+    IndirectOutputArg() : IndirectOutputArg(static_cast<T *>(nullptr), size_t(0)) {
+    }
+
+    // Direct form.
+    IndirectOutputArg(T *base, size_t stride) : is_indirect(false) {
+        direct.stride = stride;
+        direct.base = base;
+    }
+
+    // Indirect form.
+    IndirectOutputArg(T * const * ptr, size_t offset) : is_indirect(true) {
+        indirect.offset = offset;
+        indirect.ptr = ptr;
+    }
+};
+
+// Returns true when the given Requantize32 carries no left shift: for per-channel
+// requantization that means no left-shift array is attached, otherwise the
+// per-layer left shift must be zero.
+inline bool quant_no_left_shift(const Requantize32 &qp) {
+    return qp.per_channel_requant ? (qp.per_channel_left_shifts == nullptr)
+                                  : (qp.per_layer_left_shift == 0);
+}
+
+// Tells whether the given Requantize32 can be handled by the "symmetric" hybrid kernels.
+// Those kernels do not include row sums, so a non-zero 'b_offset' rules them out;
+// a left shift rules them out as well.
+inline bool quant_hybrid_symmetric(const Requantize32 &qp) {
+    if (qp.b_offset != 0) {
+        return false;
+    }
+    return quant_no_left_shift(qp);
+}
+
+// Tells whether the given Requantize32 can be handled by the "asymmetric" hybrid kernels.
+// Per-channel quantization is not supported by them, and neither is a left shift.
+// 'b_offset' is deliberately not tested: b_offset==0 would technically work, it is
+// merely wasteful to sum and then multiply by 0.
+inline bool quant_hybrid_asymmetric(const Requantize32 &qp) {
+    if (qp.per_channel_requant) {
+        return false;
+    }
+    return quant_no_left_shift(qp);
+}
+
+/*
+ * Input source descriptor with two mutually exclusive forms, selected by
+ * 'is_indirect':
+ *  - direct:   a base pointer and a stride;
+ *  - indirect: a pointer-to-pointer table together with a start row and a
+ *              start column.
+ * Whichever form is not supplied at construction keeps its zero-initialised
+ * default.
+ */
+template<typename T>
+struct IndirectInputArg {
+    struct {
+        const T *base;
+        size_t stride;
+    } direct = {};
+    struct {
+        const T * const * const * ptr;
+        unsigned int start_row;
+        unsigned int start_col;
+    } indirect = {};
+    bool is_indirect;
+
+    // Default state: direct form with a null base pointer and a stride of zero.
+    IndirectInputArg() : IndirectInputArg(static_cast<const T *>(nullptr), size_t(0)) {
+    }
+
+    // Direct form.
+    IndirectInputArg(const T *base, size_t stride) : is_indirect(false) {
+        direct.stride = stride;
+        direct.base = base;
+    }
+
+    // Indirect form.
+    IndirectInputArg(const T * const * const *ptr, unsigned int start_row, unsigned int start_col) : is_indirect(true) {
+        indirect.start_col = start_col;
+        indirect.start_row = start_row;
+        indirect.ptr = ptr;
+    }
+};
+
namespace utils {
namespace {
diff --git a/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp b/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp
deleted file mode 100644
index 760274dba1..0000000000
--- a/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.cpp
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/WindowIterator.h"
-
-using namespace arm_compute;
-
-INEGEMMWrapperKernel::INEGEMMWrapperKernel()
- : _a(nullptr), _b(nullptr), _c(nullptr), _params(), _gemm_info(), _window3d(), _window_shape()
-{
-}
-
-INEGEMMWrapperKernel::Params INEGEMMWrapperKernel::extract_parameters(const ITensor *a, const ITensor *b, const ITensor *c, const GEMMInfo &gemm_info)
-{
- Params p;
-
- ARM_COMPUTE_ERROR_ON_NULLPTR(a);
- ARM_COMPUTE_ERROR_ON_NULLPTR(b);
- ARM_COMPUTE_ERROR_ON_NULLPTR(c);
-
- // Initalize params
- p.M = c->info()->tensor_shape().y();
- p.N = c->info()->tensor_shape().x();
- p.K = a->info()->tensor_shape().x();
- p.multis = b->info()->tensor_shape().z();
- p.batches = c->info()->tensor_shape().total_size_upper(2) / p.multis; //COMPMID-1423: Agree on and document the layout of gemm inputs/outputs
-
- // Update M in case of GEMM3D for output
- if(gemm_info.depth_output_gemm3d() != 0)
- {
- p.M = c->info()->tensor_shape().y() * c->info()->tensor_shape().z();
- p.batches = c->info()->tensor_shape().total_size_upper(3) / p.multis;
- }
-
- return p;
-}
-
-void INEGEMMWrapperKernel::configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, const GEMMInfo &gemm_info)
-{
- _gemm_info = gemm_info;
- _params = extract_parameters(a, b, c, gemm_info);
- _a = a;
- _b = b;
- _c = c;
-
- _window3d = configure_internal(alpha, beta);
- _window_shape = _window3d.shape();
-
- // Convert the 3D window into a 1D window in order to allow the scheduler to arbitrary split it.
- Window collapsed;
- collapsed.set(0, Window::Dimension(0, _window3d.num_iterations_total()));
-
- INEKernel::configure(collapsed);
-}
-
-void INEGEMMWrapperKernel::run(const Window &window, const ThreadInfo &info)
-{
- const Coordinates start_offset = index2coords(_window_shape, window.x().start());
- const Coordinates end_offset = index2coords(_window_shape, window.x().end() - 1);
-
- run_internal(_window3d, start_offset, end_offset, info);
-}
diff --git a/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h b/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h
deleted file mode 100644
index 92c013260b..0000000000
--- a/src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef SRC_INEGEMMWRAPPERKERNEL_H
-#define SRC_INEGEMMWRAPPERKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Common interface for all the arm_gemm Gemms
- */
-class INEGEMMWrapperKernel : public INEKernel
-{
-public:
- /** Parameters defining the dimensions of the matrices being multiplied */
- struct Params
- {
- unsigned int M{ 0 }; /**< Rows in output matrix C (and input matrix A). */
- unsigned int N{ 0 }; /**< Columns in output matrix C (and input matrix B). */
- unsigned int K{ 0 }; /**< Columns of input matrix A (= rows of input matrix B). */
- unsigned int batches{ 0 }; /**< Number of "batched" GEMMs (unique A and C, shared B). */
- unsigned int multis{ 0 }; /**< Number of "multi" GEMMs (unique A, B and C). */
- };
-
- static Params extract_parameters(const ITensor *a, const ITensor *b, const ITensor *c, const GEMMInfo &gemm_info);
-
- /** Constructor */
- INEGEMMWrapperKernel();
- /** Prevent instances of this class from being copied */
- INEGEMMWrapperKernel(const INEGEMMWrapperKernel &) = delete;
- /** Prevent instances of this class from being copied */
- INEGEMMWrapperKernel &operator=(const INEGEMMWrapperKernel &) = delete;
- /** Allow instances of this class to be moved */
- INEGEMMWrapperKernel(INEGEMMWrapperKernel &&) = default;
- /** Allow instances of this class to be moved */
- INEGEMMWrapperKernel &operator=(INEGEMMWrapperKernel &&) = default;
- /** Initialise the kernel's input and output.
- *
- * @note The input and output tensor must have the same dimensions
- *
- * @param[in] a Input tensor (Matrix A)
- * @param[in] b Input tensor (Matrix B)
- * @param[out] c Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
- * @param[in] alpha Scalar multiplier to apply to AB matrix product.
- * @param[in] beta Scalar multiplier to apply to input C matrix before adding product.
- * @param[in] gemm_info GEMM meta-data
- */
- void configure(const ITensor *a, const ITensor *b, ITensor *c, float alpha, float beta, const GEMMInfo &gemm_info);
-
- // Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-protected:
- /** Called as part of configure() after _a, _b, _c and _params have been set.
- *
- * @param[in] alpha Scalar multiplier to apply to AB matrix product.
- * @param[in] beta Scalar multiplier to apply to input C matrix before adding product.
- *
- * @return A 3D execution window.
- */
- virtual Window configure_internal(float alpha, float beta) = 0;
-
- /** Run the kernel from the start to the end offset in window.
- *
- * @param[in] window Window to use for the iteration
- * @param[in] start_offset Where to start iterating from (In Window coordinates)
- * @param[in] end_offset Where to stop iterating (In Window coordinates).
- * @param[in] info Info about executing thread and CPU.
- */
- virtual void run_internal(const Window &window, const Coordinates &start_offset, const Coordinates &end_offset, const ThreadInfo &info) = 0;
-
- const ITensor *_a;
- const ITensor *_b;
- ITensor *_c;
- Params _params;
- GEMMInfo _gemm_info;
-
-private:
- Window _window3d;
- TensorShape _window_shape;
-};
-
-} // namespace arm_compute
-
-#endif /* SRC_INEGEMMRAPPERKERNEL_H */
diff --git a/src/core/NEON/kernels/assembly/arm_gemm.hpp b/src/core/NEON/kernels/assembly/arm_gemm.hpp
index f6421c12ab..3088b080d6 100644
--- a/src/core/NEON/kernels/assembly/arm_gemm.hpp
+++ b/src/core/NEON/kernels/assembly/arm_gemm.hpp
@@ -43,7 +43,9 @@ enum class GemmMethod
GEMM_INTERLEAVED_2D,
QUANTIZE_WRAPPER,
QUANTIZE_WRAPPER_2D,
- GEMM_HYBRID_QUANTIZED
+ GEMM_HYBRID_QUANTIZED,
+ INDIRECT_GEMM,
+ CONVOLUTION_GEMM
};
struct KernelDescription
@@ -104,17 +106,19 @@ public:
unsigned int _Msize;
unsigned int _Nsize;
unsigned int _Ksize;
+ unsigned int _Ksections;
unsigned int _nbatches;
unsigned int _nmulti;
+ bool _indirect_input;
Activation _act;
int _maxthreads;
const GemmConfig *_cfg;
- GemmArgs(const CPUInfo *ci, const unsigned int M, const unsigned int N,
- const unsigned int K, const unsigned int nbatches,
- const unsigned int nmulti, Activation act, const int maxthreads,
+ GemmArgs(const CPUInfo *ci, unsigned int M, unsigned int N,
+ unsigned int K, unsigned int Ksections, unsigned int nbatches,
+ unsigned int nmulti, bool indirect_input, Activation act, const int maxthreads,
const GemmConfig *cfg = nullptr)
- : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti), _act(act), _maxthreads(maxthreads), _cfg(cfg)
+ : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _Ksections(Ksections), _nbatches(nbatches), _nmulti(nmulti), _indirect_input(indirect_input), _act(act), _maxthreads(maxthreads), _cfg(cfg)
{
}
};
@@ -143,8 +147,8 @@ public:
Requantize32(const int32_t *bias, size_t bias_multi_stride,
int32_t a_offset, int32_t b_offset, int32_t c_offset,
int32_t requant_shift, int32_t requant_mul, int32_t minv, int32_t maxv)
- : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_left_shift(std::max(requant_shift, int32_t(0))),
- per_layer_right_shift(std::min(requant_shift, int32_t(0))), per_layer_mul(requant_mul), minval(minv), maxval(maxv)
+ : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_left_shift(std::max<int32_t>(requant_shift, 0)),
+ per_layer_right_shift(std::min<int32_t>(requant_shift, 0)), per_layer_mul(requant_mul), minval(minv), maxval(maxv)
{
}
diff --git a/src/core/NEON/kernels/assembly/convolution_parameters.hpp b/src/core/NEON/kernels/assembly/convolution_parameters.hpp
new file mode 100644
index 0000000000..d0ef5b539f
--- /dev/null
+++ b/src/core/NEON/kernels/assembly/convolution_parameters.hpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2018-2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include <cstdint>
+
+namespace arm_gemm
+{
+/*
+ * Parameter set for "convolution" type GEMM.
+ *
+ * For a "convolution" GEMM, the GEMM parameters (M, K) are specified as if
+ * an im2row had been performed on the input tensor to generate the operand
+ * matrix, but instead this structure describes the convolution parameters
+ * such that this can be done on the fly.
+ *
+ * The parameters describe the convolution details - the notional shape of
+ * the input and output tensors, whether padding is to be applied, the size
+ * of the kernel and a constant value to be used for padding (needed for
+ * quantized tensors).
+ *
+ * The second part describes the layout of the input tensor in memory, which
+ * is assumed to be in NHWC format. This consists of a base pointer and
+ * strides for columns, rows and batches. 'multis' are not supported for
+ * convolution type GEMMs.
+ */
+struct ConvolutionParameters // NOTE(review): the header comment also mentions a base pointer and strides for the input layout; no such members are declared here — confirm where they live
+{
+ int64_t input_width; // notional input tensor shape (width)
+ int64_t input_height; // notional input tensor shape (height)
+ int64_t input_channels; // notional input tensor shape (channels)
+ int64_t kernel_width; // kernel size (width)
+ int64_t kernel_height; // kernel size (height)
+ int64_t output_width; // notional output tensor shape (width)
+ int64_t output_height; // notional output tensor shape (height)
+ int64_t output_stride_w; // NOTE(review): presumably the convolution step along width — confirm
+ int64_t output_stride_h; // NOTE(review): presumably the convolution step along height — confirm
+ // output_channels not included as they do not affect the input.
+ int64_t padding_top; // padding applied at the top; units not shown here
+ int64_t padding_left; // padding applied at the left; units not shown here
+ float padding_value; // constant value used for padding (needed for quantized tensors)
+};
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/assembly/gemm_common.hpp b/src/core/NEON/kernels/assembly/gemm_common.hpp
index e9e56842c7..e1fb7a45a8 100644
--- a/src/core/NEON/kernels/assembly/gemm_common.hpp
+++ b/src/core/NEON/kernels/assembly/gemm_common.hpp
@@ -23,6 +23,7 @@
*/
#pragma once
+#include "convolution_parameters.hpp"
#include "ndrange.hpp"
#include <cstddef>
@@ -77,7 +78,7 @@ public:
return false;
}
- /** Main execute member function
+ /** Main execute member function
* @param [in] work_range specifies the range of work we want to be computed, total range defined by get_window_size()
* @param [in] thread_locator where are we inside of the thread space
* @naram [in] threadid a unique threadid
@@ -123,6 +124,19 @@ public:
{
}
+ /*** Indirect interface (optional) ***/
+ /* Set the indirect table. This comprises a number of values per kernel point, and a densely packed array of pointers,
+ * multis * batches * kernel_points.
+ * The default implementation has an empty body, so both arguments are ignored unless a derived class overrides it. */
+ virtual void set_indirect_parameters_generic(size_t, const void *const *const *)
+ {
+ }
+
+ /*** Convolution interface (optional) ***/
+ /* Set the convolution parameters. The default implementation has an empty body and discards them. */
+ virtual void set_convolution_parameters(ConvolutionParameters)
+ {
+ }
+
// Destructor
virtual ~IGemmCommon()
{
@@ -200,6 +214,16 @@ public:
{
pretranspose_B_array(out, static_cast<const To *>(in), row_stride, multi_stride);
}
+
+ /*** Indirect interface ***/
+ virtual void set_indirect_parameters(size_t, const To *const *const *) // typed form of the indirect-table setter; this default body is empty
+ {
+ }
+
+ // Type-erased entry point: reinterprets the generic pointer table as a table of
+ // 'To' pointers and forwards it, with the size argument, to set_indirect_parameters().
+ void set_indirect_parameters_generic(size_t sz, const void *const *const *ptr) override
+ {
+     const auto typed_table = reinterpret_cast<const To *const *const *>(ptr);
+     set_indirect_parameters(sz, typed_table);
+ }
};
} // namespace arm_gemm
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 901b1e880e..cc5f160787 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -27,27 +27,12 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NECol2ImKernel.h"
-#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "src/core/NEON/kernels/NECopyKernel.h"
-#include "src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
-#include "src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"
-#include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
-#include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
-#include "src/core/NEON/kernels/NEFFTScaleKernel.h"
-#include "src/core/NEON/kernels/NEFillBorderKernel.h"
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "src/core/NEON/kernels/NEIm2ColKernel.h"
-#include "src/core/NEON/kernels/NEPadLayerKernel.h"
-#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
-#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
+#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
+
#include "support/MemorySupport.h"
#include <cmath>
@@ -71,6 +56,7 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const
ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
enable_fast_math, num_groups));
+ const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
switch(NEConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math))
{
case ConvolutionMethod::WINOGRAD:
@@ -87,6 +73,13 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const
_function = std::move(f);
break;
}
+ case ConvolutionMethod::GEMM_CONV2D:
+ {
+ auto f = arm_compute::support::cpp14::make_unique<NEGEMMConv2d>(_memory_manager);
+ f->configure(input, weights, biases, output, info);
+ _function = std::move(f);
+ break;
+ }
case ConvolutionMethod::DIRECT:
{
auto f = arm_compute::support::cpp14::make_unique<NEDirectConvolutionLayer>(_memory_manager);
@@ -112,22 +105,22 @@ Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on NEON");
+ const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
switch(NEConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math))
{
case ConvolutionMethod::WINOGRAD:
- //Validate Winograd
ARM_COMPUTE_RETURN_ON_ERROR(NEWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
break;
case ConvolutionMethod::GEMM:
- //Validate Gemm-based Convolution
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMConvolutionLayer::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info));
break;
+ case ConvolutionMethod::GEMM_CONV2D:
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMConv2d::validate(input, weights, biases, output, info));
+ break;
case ConvolutionMethod::DIRECT:
- //Validate Direct Convolution
ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
break;
case ConvolutionMethod::FFT:
- // Validate FFT-based convolution layer
ARM_COMPUTE_RETURN_ON_ERROR(NEFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info));
break;
default:
@@ -149,6 +142,8 @@ ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *
const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+ const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, 1);
+
/* Input spatial dims, kernel size, IFM/OFM, conv info*/
using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo>;
using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
@@ -235,7 +230,21 @@ ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *
}
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
- return bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
+ // For 1x1 convolutions run the default GEMM
+ if(weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1)
+ {
+ return ConvolutionMethod::GEMM;
+ }
+
+ if(bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)))
+ {
+ return ConvolutionMethod::WINOGRAD;
+ }
+ if(bool(NEGEMMConv2d::validate(input, weights, nullptr, output, info)))
+ {
+ return ConvolutionMethod::GEMM_CONV2D;
+ }
+ return ConvolutionMethod::GEMM;
}
}
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 0215098792..9f52e458d2 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -47,7 +47,19 @@ using namespace arm_compute::misc::shape_calculator;
namespace arm_compute
{
-NEGEMM::~NEGEMM() = default;
+namespace
+{
+/** Build the assembly-dispatch descriptor that corresponds to a GEMMInfo.
+ *
+ * The method is always AsmConvMethod::Im2Col; reinterpret_input_as_3d,
+ * depth_output_gemm3d and the activation info are copied from @p info.
+ *
+ * @param[in] info GEMM meta-data to translate.
+ *
+ * @return The populated AsmGemmInfo.
+ */
+AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
+{
+    AsmGemmInfo metadata;
+    metadata.activation_info         = info.activation_info();
+    metadata.depth_output_gemm3d     = info.depth_output_gemm3d();
+    metadata.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
+    metadata.method                  = AsmConvMethod::Im2Col;
+    return metadata;
+}
+} // namespace
NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
: _memory_group(memory_manager), _weights_manager(weights_manager), _interleave_kernel(), _transpose_kernel(), _mm_kernel(), _asm_glue(memory_manager, weights_manager), _ma_kernel(),
@@ -56,12 +68,15 @@ NEGEMM::NEGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *
{
}
+NEGEMM::~NEGEMM() = default;
+
void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info)
{
ARM_COMPUTE_ERROR_THROW_ON(NEGEMM::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info));
- const bool is_c_bias = gemm_info.reshape_b_only_on_first_run();
- bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a->info(), b->info(), (is_c_bias && c != nullptr) ? c->info() : nullptr, d->info(), gemm_info));
+ const AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
+ const bool is_c_bias = gemm_info.reshape_b_only_on_first_run();
+ bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a->info(), b->info(), (is_c_bias && c != nullptr) ? c->info() : nullptr, d->info(), asm_info));
// Check if we need to reshape the matrix B only on the first run
_is_prepared = false;
@@ -76,7 +91,7 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
if(run_optimised)
{
const ITensor *c_to_use = is_c_bias ? c : nullptr;
- _asm_glue.configure(a, b, c_to_use, d, gemm_info);
+ _asm_glue.configure(a, b, c_to_use, d, asm_info);
ARM_COMPUTE_ERROR_ON(!_asm_glue.is_configured());
// Scale product by alpha
@@ -221,7 +236,8 @@ Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
}
// Check if we need to run the optimized assembly kernel
- const bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, output, gemm_info));
+ AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
+ const bool run_optimised = bool(NEGEMMAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, output, asm_info));
if(!run_optimised)
{
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index 5b0848398d..400fa64438 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -25,18 +25,70 @@
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
#include "src/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h"
#include "src/core/NEON/kernels/assembly/arm_gemm.hpp"
#include "support/MemorySupport.h"
#include <arm_neon.h>
+#include <cstdlib>
namespace arm_compute
{
namespace
{
+/** Deleter functor that releases the pointer it is given by passing it to free(). */
+struct free_delete
+{
+    void operator()(void *ptr)
+    {
+        std::free(ptr);
+    }
+};
+
+struct Params // GEMM problem description produced by extract_parameters()
+{
+ unsigned int M; /**< Taken from the output tensor's y dimension (y * z when depth_output_gemm3d is non-zero). */
+ unsigned int N; /**< Taken from the output tensor's x dimension. */
+ unsigned int K; /**< Taken from the x dimension of input tensor a. */
+ unsigned int batches; /**< Collapsed upper dimensions of the output tensor divided by multis. */
+ unsigned int multis; /**< 1, or the z dimension of tensor b on the non-indirect path. */
+ unsigned int sections; /**< 1, or the product of dimensions 2 and 3 of tensor b for the Conv/Indirect methods. */
+ bool indirect; /**< True when the method is AsmConvMethod::Conv or AsmConvMethod::Indirect. */
+};
+
+/** Derive the GEMM problem dimensions for the assembly dispatch from the operand tensors.
+ *
+ * @param[in] a    Input tensor; its x dimension provides K.
+ * @param[in] b    Input tensor; its z dimension provides the multi count on the non-indirect path,
+ *                 and the product of its dimensions 2 and 3 provides the section count for the Conv/Indirect methods.
+ * @param[in] d    Output tensor; provides N (x), M (y, or y * z for GEMM3D output) and the batch count.
+ * @param[in] info Assembly GEMM meta-data; method and depth_output_gemm3d are read.
+ *
+ * @return A parameter set in which every field has been assigned.
+ */
+Params extract_parameters(const ITensor *a, const ITensor *b, const ITensor *d, const AsmGemmInfo &info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
+
+    // Assign every field up front. M and batches used to be written only on the
+    // non-indirect path or when depth_output_gemm3d was set, so a Conv/Indirect
+    // request without GEMM3D output returned indeterminate values for them.
+    Params p;
+    p.M        = d->info()->tensor_shape().y();
+    p.N        = d->info()->tensor_shape().x();
+    p.K        = a->info()->tensor_shape().x();
+    p.batches  = 1;
+    p.multis   = 1;
+    p.sections = 1;
+    p.indirect = false;
+
+    if(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect)
+    {
+        p.indirect = true;
+        p.sections = b->info()->tensor_shape()[2] * b->info()->tensor_shape()[3];
+    }
+    else
+    {
+        p.multis  = b->info()->tensor_shape().z();
+        p.batches = d->info()->tensor_shape().total_size_upper(2) / p.multis; //COMPMID-1423: Agree on and document the layout of gemm inputs/outputs
+    }
+
+    // Update M in case of GEMM3D for output
+    if(info.depth_output_gemm3d != 0)
+    {
+        p.M       = d->info()->tensor_shape().y() * d->info()->tensor_shape().z();
+        p.batches = d->info()->tensor_shape().total_size_upper(3) / p.multis;
+    }
+
+    return p;
+}
+
arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act)
{
arm_gemm::Activation gemm_act;
@@ -69,6 +121,29 @@ arm_gemm::Activation map_to_arm_gemm_activation(const ActivationLayerInfo &act)
return gemm_act;
}
+/** Choose the scheduler hints used to run an assembly GEMM kernel.
+ *
+ * @param[in] method    GEMM method selected for the kernel.
+ * @param[in] data_type Data type the kernel operates on.
+ *
+ * @return Hints splitting along Window::DimX, unless one of the method/data-type
+ *         combinations handled below applies.
+ */
+IScheduler::Hints scheduling_hint_heuristic(arm_gemm::GemmMethod method, DataType data_type)
+{
+    const int granule_threshold = 200;
+
+    switch(method)
+    {
+        case arm_gemm::GemmMethod::GEMM_INTERLEAVED:
+            if(data_type == DataType::F32)
+            {
+                return IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold);
+            }
+            break;
+        case arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D:
+            // 2D parallelism: IScheduler::split_dimensions_all signals to parallelise over all window dimensions
+            if(data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 || data_type == DataType::S8)
+            {
+                return IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
+            }
+            break;
+        case arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D:
+            // Special case for QASYMM8 to support 2D parallelism; may be tuned differently from the FP32 case
+            if(data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED)
+            {
+                return IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
+            }
+            break;
+        default:
+            break;
+    }
+
+    // No special case matched: schedule along the X dimension with default settings.
+    return IScheduler::Hints(Window::DimX);
+}
+
template <typename TypeInput, typename TypeOutput>
class FallbackTransform : public ITransformWeights
{
@@ -165,7 +240,7 @@ public:
* @param[in] os Output stage meta-data.
*/
void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d,
- arm_gemm::GemmArgs args, const GEMMInfo &gemm_info,
+ arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os = {});
/** Set requantization shifts to be used
@@ -198,6 +273,16 @@ private:
* @param[in] alignment Workspace memory alignment.
*/
void allocate_workspace(size_t workspace_size, MemoryGroup &memory_group, size_t alignment);
+ /** Configure the indirect buffer
+ *
+ * @param[in] a Input tensor containing the Matrix A.
+ * @param[in] b Input tensor containing the Matrix B.
+ * @param[out] d Output tensor to store the result of matrix multiplication.
+ * @param[in] info GEMM meta-data
+ */
+ void configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info);
+ /** Prepare the indirect buffer */
+ void prepare_indirect_buffer();
/** Assembly Gemm kernel */
std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
@@ -226,7 +311,7 @@ private:
/** Prepared flag */
bool _is_prepared{ false };
/** GEMM meta-data */
- GEMMInfo _gemm_info{};
+ AsmGemmInfo _gemm_info{};
/** Weights manager */
IWeightsManager *_weights_manager{ nullptr };
/** Weights transform object */
@@ -239,11 +324,16 @@ private:
std::vector<int32_t> left_shifts{};
/** Per channel quantization multipliers */
std::vector<int32_t> _multipliers{};
+ /** Indirect buffer */
+ std::unique_ptr<const TypeInput *const *, free_delete> _indirect_arg{};
+ std::unique_ptr<const TypeInput *, free_delete> _indirect_buf{};
+ std::vector<TypeInput> _indirect_pad{};
+ arm_gemm::ConvolutionParameters _cp{};
};
template <typename TypeInput, typename TypeOutput, class OutputStage>
-std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts,
- const std::vector<int32_t> &multipliers)
+std::tuple<bool, const int32_t *, const int32_t *, const int32_t *>
+Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers)
{
_multipliers = multipliers;
_shifts = shifts;
@@ -261,8 +351,122 @@ std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> Fallback<Typ
}
template <typename TypeInput, typename TypeOutput, class OutputStage>
+void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer() // Fills _indirect_buf with one input pointer per (batch, kernel tap, output position).
+{
+ const TypeInput *A_ptr = reinterpret_cast<const TypeInput *>(_a->buffer() + _a->info()->offset_first_element_in_bytes()); // Start at the first element, exactly as run() does for in0_ptr.
+ const int multis = 1; // Only a single multi is walked here.
+ const int batches = _a->info()->tensor_shape().total_size_upper(3); // presumably the product of dimensions 3 and above - TODO confirm
+ const size_t stride_A = _a->info()->strides_in_bytes().y() / sizeof(TypeInput); // Strides are converted from bytes to elements.
+ const size_t batch_stride_A = _a->info()->strides_in_bytes()[3] / sizeof(TypeInput);
+ const size_t multi_stride_A = _a->info()->strides_in_bytes()[4] / sizeof(TypeInput);
+
+ const size_t output_hw = _cp.output_height * _cp.output_width;
+ const int batch_size = _cp.kernel_height * _cp.kernel_width * output_hw * sizeof(TypeInput);
+ const size_t batch_stride = batch_size / sizeof(TypeInput); // Entries per batch: kernel_height * kernel_width * output_hw.
+ const int multi_size = batch_size * batches;
+ const size_t multi_stride = multi_size / sizeof(TypeInput); // Entries per multi: batch_stride * batches.
+
+ for(int64_t m = 0; m < multis; m++)
+ {
+ for(int64_t b = 0; b < batches; b++)
+ {
+ for(int64_t output_y = 0; output_y < _cp.output_height; output_y++)
+ {
+ for(int64_t output_x = 0; output_x < _cp.output_width; output_x++)
+ {
+ int64_t output_xy = (output_y * _cp.output_width) + output_x; // Linear index of the output position.
+
+ for(int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++)
+ {
+ for(int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++)
+ {
+ int64_t input_x = (output_x * _cp.output_stride_w) + kernel_x - _cp.padding_left; // Input coordinate read by this kernel tap; may fall outside the input.
+ int64_t input_y = (output_y * _cp.output_stride_h) + kernel_y - _cp.padding_top;
+ int64_t kernel_xy = (kernel_y * _cp.kernel_width) + kernel_x; // Linear index of the kernel tap.
+ int64_t input_xy = (input_y * _cp.input_width) + input_x; // Linear index of the input position (only meaningful when in bounds).
+
+ if(input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height) // Tap lands outside the input.
+ {
+ _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = _indirect_pad.data(); // Use the shared padding row instead of tensor memory.
+ }
+ else
+ {
+ _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
+ A_ptr + (m * multi_stride_A + b * batch_stride_A + input_xy * stride_A); // Address of the input element row for this tap.
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+template <typename TypeInput, typename TypeOutput, class OutputStage>
+void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info) // Builds _cp and, for the Indirect method, allocates the pointer tables given to the assembly kernel.
+{
+ ARM_COMPUTE_ERROR_ON(!(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect)); // Only the Conv and Indirect methods are handled here.
+
+ float zeropad = 0.f; // Value stored at padded positions.
+ if(is_data_type_quantized(a->data_type()))
+ {
+ zeropad = a->quantization_info().uniform().offset; // Quantized inputs pad with the quantization offset.
+ }
+
+ const int64_t input_width = static_cast<int64_t>(a->tensor_shape()[1]); // Sizes are read from fixed shape indices of a, b and d.
+ const int64_t input_height = static_cast<int64_t>(a->tensor_shape()[2]);
+ const int64_t input_channels = static_cast<int64_t>(a->tensor_shape()[0]);
+ const int64_t kernel_width = static_cast<int64_t>(b->tensor_shape()[2]);
+ const int64_t kernel_height = static_cast<int64_t>(b->tensor_shape()[3]);
+ const int64_t output_width = static_cast<int64_t>(d->tensor_shape()[1]);
+ const int64_t output_height = static_cast<int64_t>(d->tensor_shape()[2]);
+
+ _cp = { input_width, input_height, input_channels, kernel_width, kernel_height, output_width, output_height,
+ info.ps_info.stride().first, info.ps_info.stride().second, info.padding_top, info.padding_left, zeropad
+ };
+
+ if(info.method == AsmConvMethod::Conv)
+ {
+ _gemm_kernel_asm->set_convolution_parameters(_cp); // Conv method: the kernel receives the convolution parameters directly.
+ }
+
+ if(info.method == AsmConvMethod::Indirect)
+ {
+ const unsigned int multis = 1;
+ const unsigned int batches = a->tensor_shape().total_size_upper(3); // presumably the product of dimensions 3 and above - TODO confirm
+ const unsigned int kernel_hw = _cp.kernel_width * _cp.kernel_height;
+ const unsigned int output_hw = _cp.output_width * _cp.output_height;
+
+ using TypeInputPtr = TypeInput *;
+ const int batch_size = kernel_hw * output_hw * sizeof(TypeInputPtr); // Bytes of pointer table per batch.
+ const size_t batch_stride = batch_size / sizeof(TypeInputPtr); // Same quantity expressed in pointer entries.
+ const int multi_size = batch_size * batches;
+ const size_t multi_stride = multi_size / sizeof(TypeInputPtr);
+
+ _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>(reinterpret_cast<const TypeInput **>(malloc(multi_size * multis))); // NOTE(review): malloc result is not checked before use - confirm this is acceptable.
+ _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>(reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches))); // One entry per (multi, batch, kernel position).
+ _indirect_pad = std::vector<TypeInput>(_cp.input_channels, zeropad); // Padding row: input_channels elements set to zeropad.
+
+ // Set indirect argument
+ int64_t pos = 0;
+ for(int64_t m = 0; m < multis; m++)
+ {
+ for(int64_t b = 0; b < batches; b++)
+ {
+ for(int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++)
+ {
+ (_indirect_arg.get())[pos++] = _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw; // Start of the output_hw-long slice of _indirect_buf for this kernel position.
+ }
+ }
+ }
+
+ _gemm_kernel_asm->set_indirect_parameters(a->tensor_shape()[0], _indirect_arg.get()); // First argument is dimension 0 of a (stored above as input_channels).
+ }
+}
+
+template <typename TypeInput, typename TypeOutput, class OutputStage>
void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d,
- arm_gemm::GemmArgs args, const GEMMInfo &gemm_info,
+ arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os)
{
arm_gemm::GemmConfig gemm_cfg;
@@ -325,6 +529,12 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensor *a, c
static_cast<Tensor *>(_pretranspose)->allocator()->init(TensorInfo(TensorShape{ (B_pretranspose_size + alignment /* FIXME: remove alignment after COMPMID-1088 */) }, 1, DataType::S8), alignment);
}
}
+
+ // Handle indirect GEMM convolution
+ if(gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect)
+ {
+ configure_indirect(a->info(), b->info(), d->info(), gemm_info);
+ }
}
template <typename TypeInput, typename TypeOutput, class OutputStage>
@@ -365,6 +575,11 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare()
}
}
+ if(_gemm_info.method == AsmConvMethod::Indirect)
+ {
+ prepare_indirect_buffer();
+ }
+
_is_prepared = true;
}
}
@@ -387,23 +602,23 @@ bool Fallback<TypeInput, TypeOutput, OutputStage>::is_configured() const
template <typename TypeInput, typename TypeOutput, class OutputStage>
void Fallback<TypeInput, TypeOutput, OutputStage>::run()
{
- const int lda = _a->info()->strides_in_bytes().y() / sizeof(TypeInput);
+ int lda = _a->info()->strides_in_bytes().y() / sizeof(TypeInput);
int ldb = 0;
const int ldd = _d->info()->strides_in_bytes().y() / sizeof(TypeOutput);
- const size_t a_batch_idx = _gemm_info.reinterpret_input_as_3d() != 0 ? 3 : 2;
+ const size_t a_batch_idx = _gemm_info.reinterpret_input_as_3d != 0 ? 3 : 2;
const size_t a_multi_idx = a_batch_idx + 1;
- const size_t d_batch_idx = _gemm_info.depth_output_gemm3d() != 0 ? 3 : 2;
+ const size_t d_batch_idx = _gemm_info.depth_output_gemm3d != 0 ? 3 : 2;
const size_t d_multi_idx = d_batch_idx + 1;
- const int batch_stride_a = _a->info()->strides_in_bytes()[a_batch_idx] / sizeof(TypeInput);
+ int batch_stride_a = _a->info()->strides_in_bytes()[a_batch_idx] / sizeof(TypeInput);
const int batch_stride_d = _d->info()->strides_in_bytes()[d_batch_idx] / sizeof(TypeOutput);
- const int multi_stride_a = _a->info()->strides_in_bytes()[a_multi_idx] / sizeof(TypeInput);
+ int multi_stride_a = _a->info()->strides_in_bytes()[a_multi_idx] / sizeof(TypeInput);
int multi_stride_b = 0;
const int multi_stride_d = _d->info()->strides_in_bytes()[d_multi_idx] / sizeof(TypeOutput);
- const auto in0_ptr = reinterpret_cast<const TypeInput *>(_a->buffer() + _a->info()->offset_first_element_in_bytes());
+ auto in0_ptr = reinterpret_cast<const TypeInput *>(_a->buffer() + _a->info()->offset_first_element_in_bytes());
const TypeInput *in1_ptr = nullptr;
auto out_ptr = reinterpret_cast<TypeOutput *>(_d->buffer() + _d->info()->offset_first_element_in_bytes());
@@ -415,25 +630,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run()
in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes());
}
- IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX);
- if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && _d->info()->data_type() == DataType::F32)
- {
- const int granule_threshold = 200;
- scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold);
- }
- else if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && (_d->info()->data_type() == DataType::F32 || _d->info()->data_type() == DataType::F16
- || _d->info()->data_type() == DataType::U8 || _d->info()->data_type() == DataType::S8))
- {
- //GEMM_INTERLEAVED supports 2D parallelism, IScheduler::split_dimensions_all signals to parallelise over all window dimensions
- const int granule_threshold = 200;
- scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
- }
- else if(_kernel_info.method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && (_d->info()->data_type() == DataType::QASYMM8 || _d->info()->data_type() == DataType::QASYMM8_SIGNED))
- {
- //special case for QASYMM8 to support 2D parallelism, scheduler here may be tweaked differently compared to FP32 case
- const int granule_threshold = 200;
- scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
- }
+ const auto scheduling_hint = scheduling_hint_heuristic(_kernel_info.method, _d->info()->data_type());
// Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads
if(_workspace.buffer() != nullptr)
@@ -458,57 +655,67 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run()
// Prepare assembly kernel
prepare();
- TypeOutput *bias = nullptr;
// Setup up matrix bias in the assembly kernel, it's just a pointer to matrix C.
+ TypeOutput *bias = nullptr;
if(_c && _c->info()->data_type() != DataType::S32)
{
bias = reinterpret_cast<TypeOutput *>(_c->buffer() + _c->info()->offset_first_element_in_bytes());
}
+
+ if(_gemm_info.method == AsmConvMethod::Indirect)
+ {
+ in0_ptr = nullptr;
+ lda = 0;
+ batch_stride_a = 0;
+ multi_stride_a = 0;
+ }
+
// Set gemm parameters
_gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a,
in1_ptr, ldb, multi_stride_b,
out_ptr, ldd, batch_stride_d, multi_stride_d,
bias, 0);
- // Schedule assembly kernel
+ // Schedule
NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint);
}
template <typename TypeInput, typename TypeOutput>
void create_arm_gemm(std::unique_ptr<NEGEMMAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
- const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const GEMMInfo &gemm_info,
+ const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const AsmGemmInfo &info,
IWeightsManager *weights_manager)
{
- INEGEMMWrapperKernel::Params p = INEGEMMWrapperKernel::extract_parameters(a, b, d, gemm_info);
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- unsigned int num_threads = NEScheduler::get().num_threads();
+ Params p = extract_parameters(a, b, d, info); // Problem description (M, N, K, sections, batches, multis, indirect) consumed by GemmArgs below.
+ const CPUInfo &ci = NEScheduler::get().cpu_info();
+ unsigned int num_threads = NEScheduler::get().num_threads();
- arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, activation, num_threads);
+ arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads); // Also forwards p.sections and p.indirect.
// Create arm_gemm fallback
auto fallback = support::cpp14::make_unique<Fallback<TypeInput, TypeOutput>>();
- fallback->configure(a, b, c, d, args, gemm_info, memory_group, weights_manager);
+ fallback->configure(a, b, c, d, args, info, memory_group, weights_manager); // Default output stage is used (no requantization argument passed).
arm_gemm = std::move(fallback);
}
template <typename TypeInput, typename TypeOutput>
void create_arm_gemm_quant(std::unique_ptr<NEGEMMAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
- const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const GEMMInfo &gemm_info,
+ const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const AsmGemmInfo &info,
IWeightsManager *weights_manager)
{
ARM_COMPUTE_UNUSED(activation);
- INEGEMMWrapperKernel::Params p = INEGEMMWrapperKernel::extract_parameters(a, b, d, gemm_info);
- const CPUInfo &ci = NEScheduler::get().cpu_info();
- unsigned int num_threads = NEScheduler::get().num_threads();
+ Params p = extract_parameters(a, b, d, info);
+ const CPUInfo &ci = NEScheduler::get().cpu_info();
+ unsigned int num_threads = NEScheduler::get().num_threads();
- arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.batches, p.multis, activation, num_threads);
+ arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads);
// Create arm_gemm fallback
auto fallback = support::cpp14::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>();
// Configure requantization info
- const int32_t a_offset = -a->info()->quantization_info().uniform().offset;
- const int32_t b_offset = -b->info()->quantization_info().uniform().offset;
- const GEMMLowpOutputStageInfo os_info = gemm_info.gemmlowp_output_stage();
+ const int32_t negation = info.negated_offsets ? 1 : -1;
+ const int32_t a_offset = -a->info()->quantization_info().uniform().offset * negation;
+ const int32_t b_offset = -b->info()->quantization_info().uniform().offset * negation;
+ const GEMMLowpOutputStageInfo os_info = info.output_stage;
arm_gemm::Requantize32 gemm_requant_info{};
if(os_info.gemmlowp_shifts.size() > 1)
@@ -530,7 +737,7 @@ void create_arm_gemm_quant(std::unique_ptr<NEGEMMAssemblyDispatch::IFallback> &a
}
// Configure fallback
- fallback->configure(a, b, c, d, args, gemm_info, memory_group, weights_manager, gemm_requant_info);
+ fallback->configure(a, b, c, d, args, info, memory_group, weights_manager, gemm_requant_info);
arm_gemm = std::move(fallback);
}
@@ -541,14 +748,13 @@ NEGEMMAssemblyDispatch::NEGEMMAssemblyDispatch(std::shared_ptr<IMemoryManager> m
{
}
-Status NEGEMMAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const GEMMInfo &gemm_info)
+Status NEGEMMAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info)
{
- ARM_COMPUTE_UNUSED(c);
+ ARM_COMPUTE_UNUSED(c, info);
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
- ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.pretranpose_B());
#ifndef __aarch64__
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->element_size() == 1, "8bit integer types only supported for aarch64");
#endif /* __aarch64__ */
@@ -579,13 +785,13 @@ bool NEGEMMAssemblyDispatch::is_activation_supported(const ActivationLayerInfo &
return act.type != arm_gemm::Activation::Type::None;
}
-void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const GEMMInfo &gemm_info)
+void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const AsmGemmInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
- arm_gemm::Activation act = map_to_arm_gemm_activation(gemm_info.activation_info());
+ arm_gemm::Activation act = map_to_arm_gemm_activation(info.activation_info);
//If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
- if(!NEGEMMAssemblyDispatch::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, d->info(), gemm_info))
+ if(!NEGEMMAssemblyDispatch::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, d->info(), info))
{
return;
}
@@ -593,40 +799,40 @@ void NEGEMMAssemblyDispatch::configure(const ITensor *a, const ITensor *b, const
switch(a->info()->data_type())
{
case DataType::F32:
- create_arm_gemm<float, float>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
+ create_arm_gemm<float, float>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
break;
#ifdef __aarch64__
case DataType::U8:
case DataType::QASYMM8:
if(d->info()->data_type() == DataType::S32)
{
- create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
+ create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
}
else
{
- create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
+ create_arm_gemm_quant<uint8_t, uint8_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
}
break;
case DataType::S8:
case DataType::QASYMM8_SIGNED:
if(d->info()->data_type() == DataType::S32)
{
- create_arm_gemm<int8_t, int32_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
+ create_arm_gemm<int8_t, int32_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
}
else
{
- create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
+ create_arm_gemm_quant<int8_t, int8_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
}
break;
#endif /* __aarch64__ */
#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
case DataType::BFLOAT16:
- create_arm_gemm<bfloat16, float>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
+ create_arm_gemm<bfloat16, float>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
break;
#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
case DataType::F16:
- create_arm_gemm<float16_t, float16_t>(_arm_gemm, _memory_group, a, b, c, d, act, gemm_info, _weights_manager);
+ create_arm_gemm<float16_t, float16_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
default:
diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
new file mode 100644
index 0000000000..642b084fb4
--- /dev/null
+++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2020 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include <set>
+namespace arm_compute
+{
+namespace
+{
+GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const ActivationLayerInfo &act) // Builds the fixed-point output stage, folding supported activations into its min/max bounds.
+{
+ // Collect the quantization info of input, weights and output; the offsets are taken as-is (no negation is done in this function).
+ // An output whose total_size() is 0 falls back to the input's quantization info.
+ const QuantizationInfo iqinfo = input->quantization_info();
+ const QuantizationInfo wqinfo = weights->quantization_info();
+ const QuantizationInfo oqinfo = (output->total_size() == 0) ? iqinfo : output->quantization_info();
+ const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
+ const DataType data_type = input->data_type();
+ // Merge activation with output stage: only the activation functions listed below change the bounds
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+ };
+ PixelValue type_min{};
+ PixelValue type_max{};
+ std::tie(type_min, type_max) = get_min_max(data_type); // Default bounds: whatever get_min_max() returns for the input data type.
+ int32_t min_activation = type_min.get<int32_t>();
+ int32_t max_activation = type_max.get<int32_t>();
+ if(supported_acts.count(act.activation()) != 0) // Any other activation keeps the default bounds.
+ {
+ std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act, data_type, uoqinfo);
+ }
+ GEMMLowpOutputStageInfo os_info;
+ os_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ os_info.gemmlowp_offset = uoqinfo.offset;
+ os_info.gemmlowp_min_bound = min_activation;
+ os_info.gemmlowp_max_bound = max_activation;
+ os_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);
+ quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, os_info); // presumably fills the multipliers/shifts of os_info - TODO confirm
+ return os_info;
+}
+AsmGemmInfo init_assembly_metadata(const Conv2dInfo &info, bool is_indirect) // Translates Conv2dInfo into the AsmGemmInfo passed to the assembly dispatch.
+{
+ AsmGemmInfo asm_info;
+ asm_info.method = is_indirect ? AsmConvMethod::Indirect : AsmConvMethod::Conv; // Never Im2Col on this path.
+ asm_info.ps_info = info.conv_info;
+ asm_info.activation_info = info.act_info;
+ asm_info.depth_output_gemm3d = true;
+ asm_info.reinterpret_input_as_3d = true;
+ asm_info.padding_top = info.conv_info.pad_top();
+ asm_info.padding_left = info.conv_info.pad_left();
+ asm_info.padding_value = 0.f;
+ asm_info.negated_offsets = false; // With false, create_arm_gemm_quant() uses the quantization offsets un-negated.
+ return asm_info; // output_stage is left at its default; the caller sets it for quantized inputs.
+}
+} // namespace
+
+NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr<IMemoryManager> &memory_manager) // Only the assembly dispatch receives the memory manager; every other member starts empty, null or false.
+ : _gemm_asm_func(memory_manager), _activation_func(), _weights_permute_func(), _original_weights(nullptr), _permuted_weights(), _is_prepared(false), _run_activation(false)
+{
+}
+void NEGEMMConv2d::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info) // Validates the arguments, then configures the weights permutation, the assembly GEMM and, if needed, a separate activation.
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_ERROR_THROW_ON(NEGEMMConv2d::validate(input->info(),
+ weights->info(),
+ biases != nullptr ? biases->info() : nullptr,
+ output->info(),
+ info));
+ _original_weights = weights; // Kept so that prepare() can mark the original weights as unused.
+ _weights_permute_func.configure(weights, &_permuted_weights, PermutationVector{ 3, 0, 1, 2 }); // The permuted weights are what the assembly dispatch consumes below.
+
+ // Configure assembly dispatch
+ AsmGemmInfo asm_info = init_assembly_metadata(info, false); // is_indirect == false selects AsmConvMethod::Conv.
+ if(is_data_type_quantized(input->info()->data_type())) // The output stage is only filled in for quantized inputs.
+ {
+ asm_info.output_stage = calculate_output_stage_metadata(input->info(), weights->info(), output->info(), info.act_info);
+ }
+ _gemm_asm_func.configure(input, &_permuted_weights, biases, output, asm_info);
+
+ // Configure activation
+ if(info.act_info.enabled() && !_gemm_asm_func.is_activation_supported(info.act_info)) // Only activations the assembly dispatch reports as unsupported get a separate pass.
+ {
+ _activation_func.configure(output, nullptr, info.act_info); // presumably in-place on output (nullptr destination) - TODO confirm
+ _run_activation = true;
+ }
+}
+Status NEGEMMConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info) // Returns an error Status if the configuration is unsupported, an empty Status otherwise.
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.num_groups > 1, "Grouping (num_groups != 1) is not supported on NEON");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() != DataLayout::NHWC, "Data layout supported is NHWC");
+ const DataType data_type = input->data_type();
+ const TensorShape i_shape = input->tensor_shape();
+ const TensorShape w_shape = weights->tensor_shape();
+ ARM_COMPUTE_RETURN_ERROR_ON(w_shape[0] != i_shape[0]); // Dimension 0 of weights and input must match.
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+ // Validate biases
+ if(biases != nullptr)
+ {
+ if(is_data_type_quantized_asymmetric(data_type)) // Expected bias type depends on the input type: S32 for asymmetric quantized, F32 for BFLOAT16, otherwise the input type.
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+ }
+ else if(data_type == DataType::BFLOAT16)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3)); // Bias length must equal dimension 3 of the weights.
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ }
+
+ AsmGemmInfo asm_info = init_assembly_metadata(info, false);
+ ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMAssemblyDispatch::validate(input, weights, biases, output, asm_info)); // NOTE(review): validation sees the original weights, while configure() passes the permuted weights - confirm this is intended.
+ return Status{};
+}
+void NEGEMMConv2d::run() // Runs the assembly GEMM and then, when configure() requested it, the separate activation.
+{
+ prepare(); // No-op after the first call (guarded by _is_prepared).
+
+ _gemm_asm_func.run();
+ if(_run_activation) // Set in configure() when the activation is not handled by the assembly dispatch.
+ {
+ _activation_func.run();
+ }
+}
+void NEGEMMConv2d::prepare() // One-off step: allocates and fills the permuted weights.
+{
+ if(!_is_prepared)
+ {
+ _permuted_weights.allocator()->allocate(); // Backing memory must exist before the permute function writes into it.
+ _weights_permute_func.run();
+ _original_weights->mark_as_unused(); // Only the permuted copy is passed to the assembly dispatch in configure().
+ _is_prepared = true;
+ }
+}
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
deleted file mode 100644
index 09637dd2d6..0000000000
--- a/src/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h"
-
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/ITensor.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/TensorAllocator.h"
-#include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
-#include "support/MemorySupport.h"
-
-namespace arm_compute
-{
-NEGEMMLowpAssemblyMatrixMultiplyCore::~NEGEMMLowpAssemblyMatrixMultiplyCore() = default;
-
-NEGEMMLowpAssemblyMatrixMultiplyCore::NEGEMMLowpAssemblyMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
- : _memory_group(memory_manager), _asm_glue(memory_manager), _mm_kernel(nullptr), _mtx_a_reshape_kernel(nullptr), _mtx_b_reshape_kernel(nullptr), _tmp_a(), _tmp_b()
-{
-}
-
-void NEGEMMLowpAssemblyMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::S8);
- ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32);
- ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
- ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(0) != (b)->info()->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
- ARM_COMPUTE_ERROR_ON_MSG((a)->info()->dimension(1) != (output)->info()->dimension(1), "The output matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_ERROR_ON_MSG((b)->info()->dimension(0) != (output)->info()->dimension(0), "The output matrix must have the same number of columns as the matrix B");
-
- bool run_optimised = false;
- switch(a->info()->data_type())
- {
- case DataType::S8:
- case DataType::QASYMM8:
- case DataType::U8:
- {
- _asm_glue.configure(a, b, c, output, GEMMInfo(false, false, true));
- run_optimised = _asm_glue.is_configured();
- break;
- }
- default:
- {
- ARM_COMPUTE_ERROR("Datatype not supported");
- break;
- }
- }
- if(!run_optimised)
- {
- // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
- TensorShape shape_tmp_a = a->info()->tensor_shape();
- shape_tmp_a.set(0, a->info()->dimension(0) * 4);
- shape_tmp_a.set(1, std::ceil(a->info()->dimension(1) / 4.f));
-
- // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
- TensorShape shape_tmp_b = b->info()->tensor_shape();
- shape_tmp_b.set(0, b->info()->dimension(1) * 16);
- shape_tmp_b.set(1, std::ceil(b->info()->dimension(0) / 16.f));
-
- TensorInfo info_a(shape_tmp_a, 1, a->info()->data_type());
- TensorInfo info_b(shape_tmp_b, 1, b->info()->data_type());
- _tmp_a.allocator()->init(info_a);
- _tmp_b.allocator()->init(info_b);
- _memory_group.manage(&_tmp_a);
- _memory_group.manage(&_tmp_b);
-
- // Configure interleave kernel
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMInterleave4x4Kernel>();
- k->configure(a, &_tmp_a);
- _mtx_a_reshape_kernel = std::move(k);
- }
-
- // Configure transpose kernel
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMTranspose1xWKernel>();
- k->configure(b, &_tmp_b);
- _mtx_b_reshape_kernel = std::move(k);
- }
-
- // Configure matrix multiply kernel
- {
- auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpMatrixMultiplyKernel>();
- k->configure(&_tmp_a, &_tmp_b, output);
- _mm_kernel = std::move(k);
- }
-
- // Allocate tensors
- _tmp_a.allocator()->allocate();
- _tmp_b.allocator()->allocate();
- }
-}
-
-void NEGEMMLowpAssemblyMatrixMultiplyCore::run()
-{
- MemoryGroupResourceScope scope_mg(_memory_group);
- if(_mtx_a_reshape_kernel)
- {
- NEScheduler::get().schedule(_mtx_a_reshape_kernel.get(), Window::DimY);
- }
-
- if(_mtx_b_reshape_kernel)
- {
- NEScheduler::get().schedule(_mtx_b_reshape_kernel.get(), Window::DimY);
- }
-
- if(_asm_glue.is_configured())
- {
- _asm_glue.run();
- }
- else
- {
- NEScheduler::get().schedule(_mm_kernel.get(), Window::DimY);
- }
-}
-} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 9050427b34..df8eaacf47 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -47,6 +47,21 @@
namespace arm_compute
{
+namespace
+{
+AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
+{
+ AsmGemmInfo asm_info;
+ asm_info.method = AsmConvMethod::Im2Col;
+ asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
+ asm_info.depth_output_gemm3d = info.depth_output_gemm3d();
+ asm_info.activation_info = info.activation_info();
+ asm_info.output_stage = info.gemmlowp_output_stage();
+
+ return asm_info;
+}
+} // namespace
+
using namespace arm_compute::misc::shape_calculator;
NEGEMMLowpMatrixMultiplyCore::~NEGEMMLowpMatrixMultiplyCore() = default;
@@ -120,6 +135,8 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
_mm_result_s32.allocator()->init(info_mm_result_s32);
}
+ // Initialize assembly kernel meta-data
+ const AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
#ifdef __aarch64__
switch(a->info()->data_type())
{
@@ -130,12 +147,12 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
{
if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
- _asm_glue.configure(a_to_use, b, c, output, gemm_info);
+ _asm_glue.configure(a_to_use, b, c, output, asm_info);
_fused_assembly_path = _asm_glue.is_configured();
}
else
{
- _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, gemm_info);
+ _asm_glue.configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, asm_info);
}
_assembly_path = _asm_glue.is_configured();
break;
@@ -346,17 +363,20 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
matrix_a_info = &signed_a;
}
+ // Initialize assembly kernel meta-data
+ const AsmGemmInfo asm_info = init_assembly_metadata(info);
+
// Check if we need to run the optimized assembly kernel
bool run_optimised = false;
bool run_optimised_requantized = false;
if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
- run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, gemm_info));
+ run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, c, output, asm_info));
run_optimised_requantized = run_optimised;
}
else
{
- run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, gemm_info));
+ run_optimised = bool(NEGEMMAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
}
if(run_optimised)
diff --git a/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp b/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp
deleted file mode 100644
index d165b2235c..0000000000
--- a/src/runtime/NEON/functions/NESimpleAssemblyFunction.cpp
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/runtime/NEON/functions/NESimpleAssemblyFunction.h"
-
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
-using namespace arm_compute;
-
-NESimpleAssemblyFunction::NESimpleAssemblyFunction() // NOLINT
- : _kernel()
-{
-}
-
-void NESimpleAssemblyFunction::run()
-{
- NEScheduler::get().schedule(_kernel.get(), Window::DimX);
-}
-
-void NESimpleAssemblyFunction::configure(std::unique_ptr<INEGEMMWrapperKernel> kernel)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(kernel.get());
- _kernel = std::move(kernel);
- ARM_COMPUTE_ERROR_ON_WINDOW_DIMENSIONS_GTE(_kernel->window(), 1);
-}
diff --git a/src/runtime/NEON/functions/NESimpleAssemblyFunction.h b/src/runtime/NEON/functions/NESimpleAssemblyFunction.h
deleted file mode 100644
index e9be54d35f..0000000000
--- a/src/runtime/NEON/functions/NESimpleAssemblyFunction.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NESIMPLEASSEMBLYFUNCTION_H
-#define ARM_COMPUTE_NESIMPLEASSEMBLYFUNCTION_H
-
-#include "arm_compute/runtime/IFunction.h"
-#include "src/core/NEON/kernels/assembly/INEGEMMWrapperKernel.h"
-
-#include <memory>
-
-namespace arm_compute
-{
-/** Basic interface for functions which have a single NEON GEMM wrapper kernel to run */
-class NESimpleAssemblyFunction : public IFunction
-{
-public:
- /** Constructor */
- NESimpleAssemblyFunction();
-
- /** Configure the function with the kernel to run
- *
- * @param[in] kernel GEMM Wrapper kernel configured and ready to run
- *
- * @note The kernel is expected to have a 1D window. The function will multi-thread this window across the X dimension.
- */
- void configure(std::unique_ptr<INEGEMMWrapperKernel> kernel);
-
- // Inherited methods overridden:
- void run() override final;
-
-protected:
- std::unique_ptr<INEGEMMWrapperKernel> _kernel; /**< Kernel to run */
-};
-} //namespace arm_compute
-#endif /*ARM_COMPUTE_NESIMPLEASSEMBLYFUNCTION_H */
diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp
index 80615c5d57..112188fdfa 100644
--- a/tests/validation/NEON/ConvolutionLayer.cpp
+++ b/tests/validation/NEON/ConvolutionLayer.cpp
@@ -23,6 +23,7 @@
*/
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMMConv2d.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"
@@ -45,6 +46,20 @@ namespace test
{
namespace validation
{
+namespace detail
+{
+template <>
+void configure_conv_function<NEGEMMConv2d, Tensor>(NEGEMMConv2d &func,
+ Tensor *src, const Tensor *weights, const Tensor *bias, Tensor *dst,
+ const PadStrideInfo &info, const WeightsInfo &weights_info,
+ const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+{
+ ARM_COMPUTE_UNUSED(weights_info);
+
+ Conv2dInfo conv_info(info, dilation, act_info, false, num_groups);
+ func.configure(src, weights, bias, dst, conv_info);
+}
+} // namespace detail
namespace
{
const RelativeTolerance<float> rel_tolerance_f32(0.01f); /**< Relative tolerance for FP32 types */
@@ -368,7 +383,7 @@ TEST_SUITE_END() // WinogradLayer
TEST_SUITE(GEMMConvolutionLayer)
template <typename T>
-using NEGEMMConvolutionLayerFixture = ConvolutionValidationFixture<Tensor, Accessor, NEGEMMConvolutionLayer, T>;
+using NEGEMMConvolutionLayerFixture = ConvolutionValidationFixture<Tensor, Accessor, NEConvolutionLayer, T>;
TEST_SUITE(Float)
#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
@@ -413,10 +428,10 @@ TEST_SUITE_END() // FP32
TEST_SUITE_END() // Float
template <typename T>
-using NEGEMMConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<Tensor, Accessor, NEGEMMConvolutionLayer, T>;
+using NEGEMMConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<Tensor, Accessor, NEConvolutionLayer, T>;
template <typename T>
-using NEGEMMConvolutionLayerQuantizedPerChannelFixture = ConvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEGEMMConvolutionLayer, T, int8_t>;
+using NEGEMMConvolutionLayerQuantizedPerChannelFixture = ConvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEConvolutionLayer, T, int8_t>;
const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
{
@@ -480,6 +495,82 @@ TEST_SUITE_END() // QSYMM8_PER_CHANNEL
TEST_SUITE_END() // Quantized
TEST_SUITE_END() // GEMMConvolutionLayer
+
+TEST_SUITE(DirectGEMMConv2d)
+template <typename T>
+using NEDirectGEMMConv2dLayerFixture = ConvolutionValidationFixture<Tensor, Accessor, NEGEMMConv2d, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectGEMMConv2dLayerFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+ framework::dataset::make("ReshapeWeights", { true })),
+ framework::dataset::make("DataType", DataType::F32)),
+ framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+ ActivationFunctionsDataset))
+{
+ // Validate output
+ validate(Accessor(_target), _reference, rel_tolerance_f32, 0.f, float(abs_tolerance_f32));
+}
+TEST_SUITE_END() // FP32
+TEST_SUITE_END() // Float
+
+template <typename T>
+using NEDirectGEMMConv2dLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<Tensor, Accessor, NEGEMMConv2d, T>;
+
+template <typename T>
+using NEDirectGEMMConv2dLayerQuantizedPerChannelFixture = ConvolutionValidationQuantizedPerChannelFixture<Tensor, Accessor, NEGEMMConv2d, T, int8_t>;
+
+const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
+{
+ ActivationLayerInfo(),
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU),
+ ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, 6.f)
+});
+TEST_SUITE(Quantized)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectGEMMConv2dLayerQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+ framework::dataset::make("ReshapeWeights", { true })),
+ framework::dataset::make("DataType", DataType::QASYMM8)),
+ framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+ framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+ QuantizedActivationFunctionsDataset))
+{
+ // Validate output
+ validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8
+
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEDirectGEMMConv2dLayerQuantizedFixture<int8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+ framework::dataset::make("ReshapeWeights", { true })),
+ framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)),
+ framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+ framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.01f, -10) })),
+ QuantizedActivationFunctionsDataset))
+{
+ // Validate output
+ validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+
+TEST_SUITE(QSYMM8_PER_CHANNEL)
+FIXTURE_DATA_TEST_CASE(RunSmallSigned, NEDirectGEMMConv2dLayerQuantizedPerChannelFixture<int8_t>, framework::DatasetMode::ALL,
+ combine(combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+ framework::dataset::make("ReshapeWeights", { true })),
+ framework::dataset::make("DataType", { DataType::QASYMM8_SIGNED })),
+ framework::dataset::make("DataLayout", { DataLayout::NHWC })),
+ QuantizationData),
+ QuantizedActivationFunctionsDataset),
+ framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+ // Validate output
+ validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QSYMM8_PER_CHANNEL
+TEST_SUITE_END() // Quantized
+
+TEST_SUITE_END() // DirectGEMMConv2d
+
TEST_SUITE_END() // NEON
} // namespace validation
} // namespace test
diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp
index 9fe7e55de7..04282c2c3c 100644
--- a/tests/validation/NEON/GEMMLowp.cpp
+++ b/tests/validation/NEON/GEMMLowp.cpp
@@ -22,7 +22,6 @@
* SOFTWARE.
*/
#include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpAssemblyMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
#include "arm_compute/runtime/Tensor.h"
@@ -53,28 +52,6 @@ const auto data_matrix_multiply = framework::dataset::make("M", 12, 20) * framew
} // namespace
TEST_SUITE(NEON)
-TEST_SUITE(ASSEMBLY_MATRIX_MULTIPLY)
-
-using NEGEMMAssemblyFixture_S8 = GEMMLowpAssemblyFixture<Tensor, Accessor, NEGEMMLowpAssemblyMatrixMultiplyCore, int8_t>;
-using NEGEMMAssemblyFixture_U8 = GEMMLowpAssemblyFixture<Tensor, Accessor, NEGEMMLowpAssemblyMatrixMultiplyCore, uint8_t>;
-
-TEST_SUITE(S8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMAssemblyFixture_S8, framework::DatasetMode::PRECOMMIT, data_matrix_multiply)
-{
- // Validate output
- validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END()
-
-TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMAssemblyFixture_U8, framework::DatasetMode::PRECOMMIT, data_matrix_multiply)
-{
- // Validate output
- validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END()
-TEST_SUITE_END()
-
TEST_SUITE(GEMMLowp)
TEST_SUITE(MatrixMultiplyCore)
using NEGEMMLowpMatrixMultiplyCoreFixture = GEMMLowpMatrixMultiplyCoreValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;
diff --git a/tests/validation/fixtures/ConvolutionLayerFixture.h b/tests/validation/fixtures/ConvolutionLayerFixture.h
index ec13e1d3e0..e1452f5dfc 100644
--- a/tests/validation/fixtures/ConvolutionLayerFixture.h
+++ b/tests/validation/fixtures/ConvolutionLayerFixture.h
@@ -42,12 +42,22 @@
namespace arm_compute
{
-class NEConvolutionLayer;
-
namespace test
{
namespace validation
{
+namespace detail
+{
+template <typename ConvolutionFunction, typename TensorType>
+void configure_conv_function(ConvolutionFunction &func,
+ TensorType *src, const TensorType *weights, const TensorType *bias, TensorType *dst,
+ const PadStrideInfo &info, const WeightsInfo &weights_info,
+ const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+{
+ func.configure(src, weights, bias, dst, info, weights_info, dilation, act_info, num_groups);
+}
+} // namespace detail
+
template <typename TensorType, typename AccessorType, typename FunctionType, typename T, typename TW>
class ConvolutionValidationGenericFixture : public framework::Fixture
{
@@ -171,7 +181,7 @@ protected:
// Create and configure function
FunctionType conv;
- conv.configure(&src, &weights, &bias, &dst, info, weights_info, dilation, act_info, num_groups);
+ detail::configure_conv_function(conv, &src, &weights, &bias, &dst, info, weights_info, dilation, act_info, num_groups);
ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
ARM_COMPUTE_EXPECT(weights.info()->is_resizable(), framework::LogLevel::ERRORS);